SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
14#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15#include "SIInstrInfo.h"
16#include "SIMachineFunctionInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
47 MachineOperand::MachineOperandType Kind;
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
69 Kind(MachineOperand::MO_Immediate) {}
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpIdx
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
116 case MachineOperand::MO_Immediate: {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
126 case MachineOperand::MO_FrameIndex: {
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
145 MachineInstr *UseMI;
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
177 MachineRegisterInfo *MRI;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarry())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
242 SmallVectorImpl<FoldCandidate> &FoldList,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
246 bool tryConstantFoldOp(MachineInstr *MI) const;
247 bool tryFoldCndMask(MachineInstr &MI) const;
248 bool tryFoldZeroHighBits(MachineInstr &MI) const;
249 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
250
251 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
252 bool tryFoldFoldableCopy(MachineInstr &MI,
253 MachineOperand *&CurrentKnownM0Val) const;
254
255 const MachineOperand *isClamp(const MachineInstr &MI) const;
256 bool tryFoldClamp(MachineInstr &MI);
257
258 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
259 bool tryFoldOMod(MachineInstr &MI);
260 bool tryFoldRegSequence(MachineInstr &MI);
261 bool tryFoldPhiAGPR(MachineInstr &MI);
262 bool tryFoldLoad(MachineInstr &MI);
263
264 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
265
266public:
267 SIFoldOperandsImpl() = default;
268
269 bool run(MachineFunction &MF);
270};
271
272class SIFoldOperandsLegacy : public MachineFunctionPass {
273public:
274 static char ID;
275
276 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
277
278 bool runOnMachineFunction(MachineFunction &MF) override {
279 if (skipFunction(MF.getFunction()))
280 return false;
281 return SIFoldOperandsImpl().run(MF);
282 }
283
284 StringRef getPassName() const override { return "SI Fold Operands"; }
285
286 void getAnalysisUsage(AnalysisUsage &AU) const override {
287 AU.setPreservesCFG();
288 MachineFunctionPass::getAnalysisUsage(AU);
289 }
290
291 MachineFunctionProperties getRequiredProperties() const override {
292 return MachineFunctionProperties().setIsSSA();
293 }
294};
295
296} // End anonymous namespace.
297
298INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
299 false)
300
301char SIFoldOperandsLegacy::ID = 0;
302
303char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
304
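// Return the register class of \p MO's register, narrowed to the matching
// subregister class when \p MO carries a subregister index.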
305static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
306 const TargetRegisterInfo &TRI,
307 const MachineOperand &MO) {
308 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
309 if (const TargetRegisterClass *SubRC =
310 TRI.getSubRegisterClass(RC, MO.getSubReg()))
311 RC = SubRC;
312 return RC;
313}
314
315// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
316static unsigned macToMad(unsigned Opc) {
317 switch (Opc) {
318 case AMDGPU::V_MAC_F32_e64:
319 return AMDGPU::V_MAD_F32_e64;
320 case AMDGPU::V_MAC_F16_e64:
321 return AMDGPU::V_MAD_F16_e64;
322 case AMDGPU::V_FMAC_F32_e64:
323 return AMDGPU::V_FMA_F32_e64;
324 case AMDGPU::V_FMAC_F16_e64:
325 return AMDGPU::V_FMA_F16_gfx9_e64;
326 case AMDGPU::V_FMAC_F16_t16_e64:
327 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
328 case AMDGPU::V_FMAC_F16_fake16_e64:
329 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
330 case AMDGPU::V_FMAC_LEGACY_F32_e64:
331 return AMDGPU::V_FMA_LEGACY_F32_e64;
332 case AMDGPU::V_FMAC_F64_e64:
333 return AMDGPU::V_FMA_F64_e64;
334 }
335 return AMDGPU::INSTRUCTION_LIST_END;
336}
337
338// TODO: Add a heuristic for when the frame index might not fit in the
339// addressing mode immediate offset, to avoid materializing it in loops.
340bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
341 const FoldableDef &OpToFold) const {
342 if (!OpToFold.isFI())
343 return false;
344
345 const unsigned Opc = UseMI.getOpcode();
346 switch (Opc) {
347 case AMDGPU::S_ADD_I32:
348 case AMDGPU::S_ADD_U32:
349 case AMDGPU::V_ADD_U32_e32:
350 case AMDGPU::V_ADD_CO_U32_e32:
351 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
352 // to insert the wave size shift at every point we use the index.
353 // TODO: Fix depending on visit order to fold immediates into the operand
354 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
355 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
356 case AMDGPU::V_ADD_U32_e64:
357 case AMDGPU::V_ADD_CO_U32_e64:
358 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
359 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
360 default:
361 break;
362 }
363
364 if (TII->isMUBUF(UseMI))
365 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
366 if (!TII->isFLATScratch(UseMI))
367 return false;
368
369 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
370 if (OpNo == SIdx)
371 return true;
372
373 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
374 return OpNo == VIdx && SIdx == -1;
375}
376
377/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
378///
379/// => %vgpr = V_ADD_U32 x, frameindex
380bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
381 Register DstReg, Register SrcReg, MachineInstr &MI) const {
382 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
383 MRI->hasOneNonDBGUse(SrcReg)) {
384 MachineInstr *Def = MRI->getVRegDef(SrcReg);
385 if (!Def || Def->getNumOperands() != 4)
386 return false;
387
388 MachineOperand *Src0 = &Def->getOperand(1);
389 MachineOperand *Src1 = &Def->getOperand(2);
390
391 // TODO: This is profitable with more operand types, and for more
392 // opcodes. But ultimately this is working around poor / nonexistent
393 // regbankselect.
394 if (!Src0->isFI() && !Src1->isFI())
395 return false;
396
397 if (Src0->isFI())
398 std::swap(Src0, Src1);
399
400 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
401 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
402 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
403 !Def->getOperand(3).isDead()) // Check if scc is dead
404 return false;
405
406 MachineBasicBlock *MBB = Def->getParent();
407 const DebugLoc &DL = Def->getDebugLoc();
408 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
409 MachineInstrBuilder Add =
410 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
411
412 if (Add->getDesc().getNumDefs() == 2) {
413 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
414 Add.addDef(CarryOutReg, RegState::Dead);
415 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
416 }
417
418 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
419 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
420 Add.addImm(0);
421
422 Def->eraseFromParent();
423 MI.eraseFromParent();
424 return true;
425 }
426
427 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
428
429 MachineBasicBlock::LivenessQueryResult Liveness =
430 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
431 if (Liveness == MachineBasicBlock::LQR_Dead) {
432 // TODO: If src1 satisfies operand constraints, use vop3 version.
433 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
434 .add(*Src0)
435 .add(*Src1)
436 .setOperandDead(3) // implicit-def $vcc
437 .setMIFlags(Def->getFlags());
438 Def->eraseFromParent();
439 MI.eraseFromParent();
440 return true;
441 }
442 }
443
444 return false;
445}
446
447FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
448 return new SIFoldOperandsLegacy();
449}
450
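// Check whether \p MI's operand \p UseOpNo is a packed-math operand for which
// folding \p ImmVal could be attempted by adjusting op_sel; the actual rewrite
// is performed by tryFoldImmWithOpSel() and may still fail.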
451bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
452 unsigned UseOpNo,
453 int64_t ImmVal) const {
454 const uint64_t TSFlags = MI->getDesc().TSFlags;
455
456 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
457 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
458 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
459 return false;
460
461 const MachineOperand &Old = MI->getOperand(UseOpNo);
462 int OpNo = MI->getOperandNo(&Old);
463
464 unsigned Opcode = MI->getOpcode();
465 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
466 switch (OpType) {
467 default:
468 return false;
476 // VOP3 packed instructions ignore op_sel source modifiers, so we cannot
477 // encode two different constants.
478 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
479 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
480 return false;
481 break;
482 }
483
484 return true;
485}
486
487bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
488 int64_t ImmVal) const {
489 MachineOperand &Old = MI->getOperand(UseOpNo);
490 unsigned Opcode = MI->getOpcode();
491 int OpNo = MI->getOperandNo(&Old);
492 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
493
494 // If the literal can be inlined as-is, apply it and short-circuit the
495 // tests below. The main motivation for this is to avoid unintuitive
496 // uses of opsel.
497 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
498 Old.ChangeToImmediate(ImmVal);
499 return true;
500 }
501
502 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
503 // op_sel in a way that allows an inline constant.
504 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
505 unsigned SrcIdx = ~0;
506 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
507 ModName = AMDGPU::OpName::src0_modifiers;
508 SrcIdx = 0;
509 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
510 ModName = AMDGPU::OpName::src1_modifiers;
511 SrcIdx = 1;
512 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
513 ModName = AMDGPU::OpName::src2_modifiers;
514 SrcIdx = 2;
515 }
516 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
517 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
518 MachineOperand &Mod = MI->getOperand(ModIdx);
519 unsigned ModVal = Mod.getImm();
520
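  // Reconstruct the 32-bit value the operand currently reads: each 16-bit half
  // comes from the low or high half of the literal depending on the matching
  // op_sel bit, and those bits are cleared in NewModVal so that a fresh op_sel
  // pattern can be chosen below.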
521 uint16_t ImmLo =
522 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
523 uint16_t ImmHi =
524 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
525 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
526 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
527
528 // Helper function that attempts to inline the given value with a newly
529 // chosen opsel pattern.
530 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
531 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
532 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
533 Old.ChangeToImmediate(Imm);
534 return true;
535 }
536
537 // Try to shuffle the halves around and leverage opsel to get an inline
538 // constant.
539 uint16_t Lo = static_cast<uint16_t>(Imm);
540 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
541 if (Lo == Hi) {
542 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
543 Mod.setImm(NewModVal);
544 Old.ChangeToImmediate(Lo);
545 return true;
546 }
547
548 if (static_cast<int16_t>(Lo) < 0) {
549 int32_t SExt = static_cast<int16_t>(Lo);
550 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
551 Mod.setImm(NewModVal);
552 Old.ChangeToImmediate(SExt);
553 return true;
554 }
555 }
556
557 // This check is only useful for integer instructions
558 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
559 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
560 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
561 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
562 return true;
563 }
564 }
565 } else {
566 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
567 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
568 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
569 Old.ChangeToImmediate(Swapped);
570 return true;
571 }
572 }
573
574 return false;
575 };
576
577 if (tryFoldToInline(Imm))
578 return true;
579
580 // Replace integer addition by subtraction and vice versa if it allows
581 // folding the immediate to an inline constant.
582 //
583 // We should only ever get here for SrcIdx == 1 due to canonicalization
584 // earlier in the pipeline, but we double-check here to be safe / fully
585 // general.
586 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
587 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
588 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
589 unsigned ClampIdx =
590 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
591 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
592
593 if (!Clamp) {
594 uint16_t NegLo = -static_cast<uint16_t>(Imm);
595 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
596 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
597
598 if (tryFoldToInline(NegImm)) {
599 unsigned NegOpcode =
600 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
601 MI->setDesc(TII->get(NegOpcode));
602 return true;
603 }
604 }
605 }
606
607 return false;
608}
609
610bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
611 MachineInstr *MI = Fold.UseMI;
612 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
613 assert(Old.isReg());
614
615 std::optional<int64_t> ImmVal;
616 if (Fold.isImm())
617 ImmVal = Fold.Def.getEffectiveImmVal();
618
619 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
620 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
621 return true;
622
623 // We can't represent the candidate as an inline constant. Try as a literal
624 // with the original opsel, checking constant bus limitations.
625 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
626 int OpNo = MI->getOperandNo(&Old);
627 if (!TII->isOperandLegal(*MI, OpNo, &New))
628 return false;
629 Old.ChangeToImmediate(*ImmVal);
630 return true;
631 }
632
633 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
634 MachineBasicBlock *MBB = MI->getParent();
635 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
636 if (Liveness != MachineBasicBlock::LQR_Dead) {
637 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
638 return false;
639 }
640
641 int Op32 = Fold.ShrinkOpcode;
642 MachineOperand &Dst0 = MI->getOperand(0);
643 MachineOperand &Dst1 = MI->getOperand(1);
644 assert(Dst0.isDef() && Dst1.isDef());
645
646 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
647
648 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
649 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
650
651 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
652
653 if (HaveNonDbgCarryUse) {
654 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
655 Dst1.getReg())
656 .addReg(AMDGPU::VCC, RegState::Kill);
657 }
658
659 // Keep the old instruction around to avoid breaking iterators, but
660 // replace it with a dummy instruction to remove uses.
661 //
662 // FIXME: We should not invert how this pass looks at operands to avoid
663 // this. Should track set of foldable movs instead of looking for uses
664 // when looking at a use.
665 Dst0.setReg(NewReg0);
666 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
667 MI->removeOperand(I);
668 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
669
670 if (Fold.Commuted)
671 TII->commuteInstruction(*Inst32, false);
672 return true;
673 }
674
675 assert(!Fold.needsShrink() && "not handled");
676
677 if (ImmVal) {
678 if (Old.isTied()) {
679 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
680 if (NewMFMAOpc == -1)
681 return false;
682 MI->setDesc(TII->get(NewMFMAOpc));
683 MI->untieRegOperand(0);
684 }
685
686 // TODO: Should we try to avoid adding this to the candidate list?
687 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
688 int OpNo = MI->getOperandNo(&Old);
689 if (!TII->isOperandLegal(*MI, OpNo, &New))
690 return false;
691
692 Old.ChangeToImmediate(*ImmVal);
693 return true;
694 }
695
696 if (Fold.isGlobal()) {
697 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
698 Fold.Def.OpToFold->getOffset(),
699 Fold.Def.OpToFold->getTargetFlags());
700 return true;
701 }
702
703 if (Fold.isFI()) {
704 Old.ChangeToFrameIndex(Fold.getFI());
705 return true;
706 }
707
708 MachineOperand *New = Fold.Def.OpToFold;
709
710 // Verify the register is compatible with the operand.
711 if (const TargetRegisterClass *OpRC =
712 TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
713 const TargetRegisterClass *NewRC =
714 TRI->getRegClassForReg(*MRI, New->getReg());
715
716 const TargetRegisterClass *ConstrainRC = OpRC;
717 if (New->getSubReg()) {
718 ConstrainRC =
719 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
720
721 if (!ConstrainRC)
722 return false;
723 }
724
725 if (New->getReg().isVirtual() &&
726 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
727 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
728 << TRI->getRegClassName(ConstrainRC) << '\n');
729 return false;
730 }
731 }
732
733 // Rework once the VS_16 register class is updated to include proper
734 // 16-bit SGPRs instead of 32-bit ones.
735 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
736 Old.setSubReg(AMDGPU::NoSubRegister);
737 if (New->getReg().isPhysical()) {
738 Old.substPhysReg(New->getReg(), *TRI);
739 } else {
740 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
741 Old.setIsUndef(New->isUndef());
742 }
743 return true;
744}
745
746static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
747 FoldCandidate &&Entry) {
748 // Skip additional folding on the same operand.
749 for (FoldCandidate &Fold : FoldList)
750 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
751 return;
752 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
753 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
754 FoldList.push_back(Entry);
755}
756
757static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
758 MachineInstr *MI, unsigned OpNo,
759 const FoldableDef &FoldOp,
760 bool Commuted = false, int ShrinkOp = -1) {
761 appendFoldCandidate(FoldList,
762 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
763}
764
765bool SIFoldOperandsImpl::tryAddToFoldList(
766 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
767 const FoldableDef &OpToFold) const {
768 const unsigned Opc = MI->getOpcode();
769
770 auto tryToFoldAsFMAAKorMK = [&]() {
771 if (!OpToFold.isImm())
772 return false;
773
774 const bool TryAK = OpNo == 3;
775 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
776 MI->setDesc(TII->get(NewOpc));
777
778 // We have to fold into the operand which would be Imm, not into OpNo.
779 bool FoldAsFMAAKorMK =
780 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
781 if (FoldAsFMAAKorMK) {
782 // Untie Src2 of fmac.
783 MI->untieRegOperand(3);
784 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
785 if (OpNo == 1) {
786 MachineOperand &Op1 = MI->getOperand(1);
787 MachineOperand &Op2 = MI->getOperand(2);
788 Register OldReg = Op1.getReg();
789 // Operand 2 might be an inlinable constant
790 if (Op2.isImm()) {
791 Op1.ChangeToImmediate(Op2.getImm());
792 Op2.ChangeToRegister(OldReg, false);
793 } else {
794 Op1.setReg(Op2.getReg());
795 Op2.setReg(OldReg);
796 }
797 }
798 return true;
799 }
800 MI->setDesc(TII->get(Opc));
801 return false;
802 };
803
804 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
805 if (!IsLegal && OpToFold.isImm()) {
806 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
807 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
808 }
809
810 if (!IsLegal) {
811 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
812 unsigned NewOpc = macToMad(Opc);
813 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
814 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
815 // to fold the operand.
816 MI->setDesc(TII->get(NewOpc));
817 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
818 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
819 if (AddOpSel)
820 MI->addOperand(MachineOperand::CreateImm(0));
821 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
822 if (FoldAsMAD) {
823 MI->untieRegOperand(OpNo);
824 return true;
825 }
826 if (AddOpSel)
827 MI->removeOperand(MI->getNumExplicitOperands() - 1);
828 MI->setDesc(TII->get(Opc));
829 }
830
831 // Special case for s_fmac_f32 if we are trying to fold into Src2.
832 // By transforming into fmaak we can untie Src2 and make folding legal.
833 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
834 if (tryToFoldAsFMAAKorMK())
835 return true;
836 }
837
838 // Special case for s_setreg_b32
839 if (OpToFold.isImm()) {
840 unsigned ImmOpc = 0;
841 if (Opc == AMDGPU::S_SETREG_B32)
842 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
843 else if (Opc == AMDGPU::S_SETREG_B32_mode)
844 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
845 if (ImmOpc) {
846 MI->setDesc(TII->get(ImmOpc));
847 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
848 return true;
849 }
850 }
851
852 // Operand is not legal, so try to commute the instruction to
853 // see if this makes it possible to fold.
854 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
855 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
856 if (!CanCommute)
857 return false;
858
859 MachineOperand &Op = MI->getOperand(OpNo);
860 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
861
862 // One of the operands might be an Imm operand, and OpNo may refer to it after
863 // the call to commuteInstruction() below. Such situations are avoided
864 // here explicitly as OpNo must be a register operand to be a candidate
865 // for memory folding.
866 if (!Op.isReg() || !CommutedOp.isReg())
867 return false;
868
869 // The same situation with an immediate can arise if both inputs are
870 // the same register.
871 if (Op.isReg() && CommutedOp.isReg() &&
872 (Op.getReg() == CommutedOp.getReg() &&
873 Op.getSubReg() == CommutedOp.getSubReg()))
874 return false;
875
876 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
877 return false;
878
879 int Op32 = -1;
880 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
881 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
882 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
883 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
884 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
885 return false;
886 }
887
888 // Verify the other operand is a VGPR, otherwise we would violate the
889 // constant bus restriction.
890 MachineOperand &OtherOp = MI->getOperand(OpNo);
891 if (!OtherOp.isReg() ||
892 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
893 return false;
894
895 assert(MI->getOperand(1).isDef());
896
897 // Make sure to get the 32-bit version of the commuted opcode.
898 unsigned MaybeCommutedOpc = MI->getOpcode();
899 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
900 }
901
902 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
903 Op32);
904 return true;
905 }
906
907 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
908 // By changing into fmamk we can untie Src2.
909 // If folding for Src0 happens first and it is an identical operand to Src1,
910 // we should avoid transforming into fmamk, which requires commuting, as it
911 // would cause folding into Src1 to fail later on due to the wrong OpNo being used.
912 if (Opc == AMDGPU::S_FMAC_F32 &&
913 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
914 if (tryToFoldAsFMAAKorMK())
915 return true;
916 }
917
918 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
919 return true;
920}
921
922bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
923 const MachineOperand &UseMO) const {
924 // Operands of SDWA instructions must be registers.
925 return !TII->isSDWA(MI);
926}
927
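// Walk a chain of foldable copies starting at \p SrcReg. Returns the ultimate
// immediate source operand if the chain ends in one, otherwise the last plain
// register source operand reached (or nullptr if there is none).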
928static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
929 const MachineRegisterInfo &MRI,
930 Register SrcReg) {
931 MachineOperand *Sub = nullptr;
932 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
933 SubDef && TII.isFoldableCopy(*SubDef);
934 SubDef = MRI.getVRegDef(Sub->getReg())) {
935 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
936 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
937
938 if (SrcOp.isImm())
939 return &SrcOp;
940 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
941 break;
942 Sub = &SrcOp;
943 // TODO: Support compose
944 if (SrcOp.getSubReg())
945 break;
946 }
947
948 return Sub;
949}
950
951const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
952 MachineInstr &RegSeq,
953 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
954
955 assert(RegSeq.isRegSequence());
956
957 const TargetRegisterClass *RC = nullptr;
958
959 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
960 MachineOperand &SrcOp = RegSeq.getOperand(I);
961 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
962
963 // Only accept reg_sequence with uniform reg class inputs for simplicity.
964 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
965 if (!RC)
966 RC = OpRC;
967 else if (!TRI->getCommonSubClass(RC, OpRC))
968 return nullptr;
969
970 if (SrcOp.getSubReg()) {
971 // TODO: Handle subregister compose
972 Defs.emplace_back(&SrcOp, SubRegIdx);
973 continue;
974 }
975
976 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
977 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
978 Defs.emplace_back(DefSrc, SubRegIdx);
979 continue;
980 }
981
982 Defs.emplace_back(&SrcOp, SubRegIdx);
983 }
984
985 return RC;
986}
987
988// Find a def of the UseReg, check if it is a reg_sequence and find initializers
989// for each subreg, tracking it to an immediate if possible. Returns the
990// register class of the inputs on success.
991const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
992 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
993 Register UseReg) const {
994 MachineInstr *Def = MRI->getVRegDef(UseReg);
995 if (!Def || !Def->isRegSequence())
996 return nullptr;
997
998 return getRegSeqInit(*Def, Defs);
999}
1000
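// Check whether \p RegSeq is a reg_sequence whose inputs all resolve to the
// same immediate, either as a 32-bit splat or as a 64-bit value split into
// consecutive 32-bit halves. Returns the splat value and the common input
// register class, or a null register class on failure.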
1001std::pair<int64_t, const TargetRegisterClass *>
1002SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1003 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1004 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1005 if (!SrcRC)
1006 return {};
1007
1008 bool TryToMatchSplat64 = false;
1009
1010 int64_t Imm;
1011 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1012 const MachineOperand *Op = Defs[I].first;
1013 if (!Op->isImm())
1014 return {};
1015
1016 int64_t SubImm = Op->getImm();
1017 if (!I) {
1018 Imm = SubImm;
1019 continue;
1020 }
1021
1022 if (Imm != SubImm) {
1023 if (I == 1 && (E & 1) == 0) {
1024 // If we have an even number of inputs, there's a chance this is a
1025 // 64-bit element splat broken into 32-bit pieces.
1026 TryToMatchSplat64 = true;
1027 break;
1028 }
1029
1030 return {}; // Can only fold splat constants
1031 }
1032 }
1033
1034 if (!TryToMatchSplat64)
1035 return {Defs[0].first->getImm(), SrcRC};
1036
1037 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1038 // (i.e. recognize that every other element is 0 for 64-bit immediates)
1039 int64_t SplatVal64;
1040 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1041 const MachineOperand *Op0 = Defs[I].first;
1042 const MachineOperand *Op1 = Defs[I + 1].first;
1043
1044 if (!Op0->isImm() || !Op1->isImm())
1045 return {};
1046
1047 unsigned SubReg0 = Defs[I].second;
1048 unsigned SubReg1 = Defs[I + 1].second;
1049
1050 // Assume we're going to generally encounter reg_sequences with sorted
1051 // subreg indexes, so reject any that aren't consecutive.
1052 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1053 TRI->getChannelFromSubReg(SubReg1))
1054 return {};
1055
1056 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1057 if (I == 0)
1058 SplatVal64 = MergedVal;
1059 else if (SplatVal64 != MergedVal)
1060 return {};
1061 }
1062
1063 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1064 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1065
1066 return {SplatVal64, RC64};
1067}
1068
1069bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1070 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1071 const TargetRegisterClass *SplatRC) const {
1072 const MCInstrDesc &Desc = UseMI->getDesc();
1073 if (UseOpIdx >= Desc.getNumOperands())
1074 return false;
1075
1076 // Filter out unhandled pseudos.
1077 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1078 return false;
1079
1080 int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
1081 if (RCID == -1)
1082 return false;
1083
1084 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1085
1086 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1087 // have the same bits. These are the only cases where a splat has the same
1088 // interpretation for 32-bit and 64-bit splats.
1089 if (SplatVal != 0 && SplatVal != -1) {
1090 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1091 // operand will be AReg_128, and we want to check if it's compatible with an
1092 // AReg_32 constant.
1093 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1094 switch (OpTy) {
1095 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
1096 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
1097 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
1098 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
1099 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1100 break;
1101 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
1102 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
1103 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
1104 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1105 break;
1106 default:
1107 return false;
1108 }
1109
1110 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1111 return false;
1112 }
1113
1114 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1115 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1116 return false;
1117
1118 return true;
1119}
1120
1121bool SIFoldOperandsImpl::tryToFoldACImm(
1122 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1123 SmallVectorImpl<FoldCandidate> &FoldList) const {
1124 const MCInstrDesc &Desc = UseMI->getDesc();
1125 if (UseOpIdx >= Desc.getNumOperands())
1126 return false;
1127
1128 // Filter out unhandled pseudos.
1129 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1130 return false;
1131
1132 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
1133 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1134 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1135 return true;
1136 }
1137
1138 // TODO: Verify the following code handles subregisters correctly.
1139 // TODO: Handle extract of global reference
1140 if (UseOp.getSubReg())
1141 return false;
1142
1143 if (!OpToFold.isReg())
1144 return false;
1145
1146 Register UseReg = OpToFold.getReg();
1147 if (!UseReg.isVirtual())
1148 return false;
1149
1150 // Maybe it is just a COPY of an immediate itself.
1151
1152 // FIXME: Remove this handling. There is already special case folding of
1153 // immediate into copy in foldOperand. This is looking for the def of the
1154 // value the folding started from in the first place.
1155 MachineInstr *Def = MRI->getVRegDef(UseReg);
1156 if (Def && TII->isFoldableCopy(*Def)) {
1157 MachineOperand &DefOp = Def->getOperand(1);
1158 if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
1159 FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
1160 OpToFold.DefSubReg);
1161 appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
1162 return true;
1163 }
1164 }
1165
1166 return false;
1167}
1168
1169void SIFoldOperandsImpl::foldOperand(
1170 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1171 SmallVectorImpl<FoldCandidate> &FoldList,
1172 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1173 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1174
1175 if (!isUseSafeToFold(*UseMI, *UseOp))
1176 return;
1177
1178 // FIXME: Fold operands with subregs.
1179 if (UseOp->isReg() && OpToFold.isReg()) {
1180 if (UseOp->isImplicit())
1181 return;
1182 // Allow folding from SGPRs to 16-bit VGPRs.
1183 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1184 (UseOp->getSubReg() != AMDGPU::lo16 ||
1185 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1186 return;
1187 }
1188
1189 // Special case for REG_SEQUENCE: We can't fold literals into
1190 // REG_SEQUENCE instructions, so we have to fold them into the
1191 // uses of REG_SEQUENCE.
1192 if (UseMI->isRegSequence()) {
1193 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
1194 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1195
1196 int64_t SplatVal;
1197 const TargetRegisterClass *SplatRC;
1198 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1199
1200 // Grab the use operands first
1201 SmallVector<MachineOperand *, 4> UsesToProcess(
1202 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
1203 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1204 MachineOperand *RSUse = UsesToProcess[I];
1205 MachineInstr *RSUseMI = RSUse->getParent();
1206 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1207
1208 if (SplatRC) {
1209 if (RSUseMI->isCopy()) {
1210 Register DstReg = RSUseMI->getOperand(0).getReg();
1211 append_range(UsesToProcess,
1212 make_pointer_range(MRI->use_nodbg_operands(DstReg)));
1213 continue;
1214 }
1215 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1216 FoldableDef SplatDef(SplatVal, SplatRC);
1217 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1218 continue;
1219 }
1220 }
1221
1222 // TODO: Handle general compose
1223 if (RSUse->getSubReg() != RegSeqDstSubReg)
1224 continue;
1225
1226 // FIXME: We should avoid recursing here. There should be a cleaner split
1227 // between the in-place mutations and adding to the fold list.
1228 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1229 CopiesToReplace);
1230 }
1231
1232 return;
1233 }
1234
1235 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1236 return;
1237
1238 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1239 // Verify that this is a stack access.
1240 // FIXME: Should probably use stack pseudos before frame lowering.
1241
1242 if (TII->isMUBUF(*UseMI)) {
1243 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1244 MFI->getScratchRSrcReg())
1245 return;
1246
1247 // Ensure this is either relative to the current frame or the current
1248 // wave.
1249 MachineOperand &SOff =
1250 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1251 if (!SOff.isImm() || SOff.getImm() != 0)
1252 return;
1253 }
1254
1255 const unsigned Opc = UseMI->getOpcode();
1256 if (TII->isFLATScratch(*UseMI) &&
1257 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1258 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1259 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1260 unsigned CPol =
1261 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1262 if ((CPol & AMDGPU::CPol::SCAL) &&
1264 return;
1265
1266 UseMI->setDesc(TII->get(NewOpc));
1267 }
1268
1269 // A frame index will resolve to a positive constant, so it should always be
1270 // safe to fold the addressing mode, even pre-GFX9.
1271 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1272
1273 return;
1274 }
1275
1276 bool FoldingImmLike =
1277 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1278
1279 if (FoldingImmLike && UseMI->isCopy()) {
1280 Register DestReg = UseMI->getOperand(0).getReg();
1281 Register SrcReg = UseMI->getOperand(1).getReg();
1282 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1283 assert(SrcReg.isVirtual());
1284
1285 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1286
1287 // Don't fold into a copy to a physical register with the same class. Doing
1288 // so would interfere with the register coalescer's logic which would avoid
1289 // redundant initializations.
1290 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1291 return;
1292
1293 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1294 // In order to fold immediates into copies, we need to change the copy to a
1295 // MOV. Find a compatible mov instruction with the value.
1296 for (unsigned MovOp :
1297 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1298 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1299 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1300 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1301 const MCInstrDesc &MovDesc = TII->get(MovOp);
1302 const TargetRegisterClass *MovDstRC =
1303 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
1304
1305 // Fold if the destination register class of the MOV instruction (MovDstRC)
1306 // is a superclass of (or equal to) the destination register class of the
1307 // COPY (DestRC). If this condition fails, folding would be illegal.
1308 if (!DestRC->hasSuperClassEq(MovDstRC))
1309 continue;
1310
1311 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1312 const TargetRegisterClass *MovSrcRC =
1313 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx]));
1314
1315 if (MovSrcRC) {
1316 if (UseSubReg)
1317 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1318
1319 // FIXME: We should be able to directly check immediate operand legality
1320 // for all cases, but gfx908 hacks break.
1321 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1322 (!OpToFold.isImm() ||
1323 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1324 *OpToFold.getEffectiveImmVal())))
1325 break;
1326
1327 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1328 break;
1329
1330 // FIXME: This is mutating the instruction only and deferring the actual
1331 // fold of the immediate
1332 } else {
1333 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1334 // immediate to verify. Technically we should always verify this, but it
1335 // only matters for these concrete cases.
1336 // TODO: Handle non-imm case if it's useful.
1337 if (!OpToFold.isImm() ||
1338 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1339 break;
1340 }
1341
1342 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
1343 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
1344 while (ImpOpI != ImpOpE) {
1345 MachineInstr::mop_iterator Tmp = ImpOpI;
1346 ImpOpI++;
1347 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
1348 }
1349 UseMI->setDesc(MovDesc);
1350
1351 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1352 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1353 MachineOperand NewSrcOp(SrcOp);
1354 MachineFunction *MF = UseMI->getParent()->getParent();
1355 UseMI->removeOperand(1);
1356 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1357 UseMI->addOperand(NewSrcOp); // src0
1358 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1359 UseOpIdx = SrcIdx;
1360 UseOp = &UseMI->getOperand(UseOpIdx);
1361 }
1362 CopiesToReplace.push_back(UseMI);
1363 break;
1364 }
1365
1366 // We failed to replace the copy, so give up.
1367 if (UseMI->getOpcode() == AMDGPU::COPY)
1368 return;
1369
1370 } else {
1371 if (UseMI->isCopy() && OpToFold.isReg() &&
1372 UseMI->getOperand(0).getReg().isVirtual() &&
1373 !UseMI->getOperand(1).getSubReg() &&
1374 OpToFold.DefMI->implicit_operands().empty()) {
1375 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1376 << *UseMI);
1377 unsigned Size = TII->getOpSize(*UseMI, 1);
1378 Register UseReg = OpToFold.getReg();
1379 UseMI->getOperand(1).setReg(UseReg);
1380 unsigned SubRegIdx = OpToFold.getSubReg();
1381 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1382 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1383 // VS_16RegClass
1384 //
1385 // Excerpt from AMDGPUGenRegisterInfo.inc
1386 // NoSubRegister, //0
1387 // hi16, // 1
1388 // lo16, // 2
1389 // sub0, // 3
1390 // ...
1391 // sub1, // 11
1392 // sub1_hi16, // 12
1393 // sub1_lo16, // 13
1394 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1395 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1396 TRI->isSGPRReg(*MRI, UseReg)) {
1397 // Produce the 32-bit subregister index to which the 16-bit subregister
1398 // is aligned.
1399 if (SubRegIdx > AMDGPU::sub1) {
1400 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1401 M |= M.getLane(M.getHighestLane() - 1);
1402 SmallVector<unsigned, 4> Indexes;
1403 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1404 Indexes);
1405 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1406 SubRegIdx = Indexes[0];
1407 // 32-bit registers do not have a sub0 index
1408 } else if (TII->getOpSize(*UseMI, 1) == 4)
1409 SubRegIdx = 0;
1410 else
1411 SubRegIdx = AMDGPU::sub0;
1412 }
1413 UseMI->getOperand(1).setSubReg(SubRegIdx);
1414 UseMI->getOperand(1).setIsKill(false);
1415 CopiesToReplace.push_back(UseMI);
1416 OpToFold.OpToFold->setIsKill(false);
1417
1418 // Remove kill flags as kills may now be out of order with uses.
1419 MRI->clearKillFlags(UseReg);
1420 if (foldCopyToAGPRRegSequence(UseMI))
1421 return;
1422 }
1423
1424 unsigned UseOpc = UseMI->getOpcode();
1425 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1426 (UseOpc == AMDGPU::V_READLANE_B32 &&
1427 (int)UseOpIdx ==
1428 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1429 // %vgpr = V_MOV_B32 imm
1430 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1431 // =>
1432 // %sgpr = S_MOV_B32 imm
1433 if (FoldingImmLike) {
1434 if (execMayBeModifiedBeforeUse(*MRI,
1435 UseMI->getOperand(UseOpIdx).getReg(),
1436 *OpToFold.DefMI, *UseMI))
1437 return;
1438
1439 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1440
1441 if (OpToFold.isImm()) {
1442 UseMI->getOperand(1).ChangeToImmediate(
1443 *OpToFold.getEffectiveImmVal());
1444 } else if (OpToFold.isFI())
1445 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1446 else {
1447 assert(OpToFold.isGlobal());
1448 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1449 OpToFold.OpToFold->getOffset(),
1450 OpToFold.OpToFold->getTargetFlags());
1451 }
1452 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1453 return;
1454 }
1455
1456 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1457 if (execMayBeModifiedBeforeUse(*MRI,
1458 UseMI->getOperand(UseOpIdx).getReg(),
1459 *OpToFold.DefMI, *UseMI))
1460 return;
1461
1462 // %vgpr = COPY %sgpr0
1463 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1464 // =>
1465 // %sgpr1 = COPY %sgpr0
1466 UseMI->setDesc(TII->get(AMDGPU::COPY));
1467 UseMI->getOperand(1).setReg(OpToFold.getReg());
1468 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1469 UseMI->getOperand(1).setIsKill(false);
1470 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1471 return;
1472 }
1473 }
1474
1475 const MCInstrDesc &UseDesc = UseMI->getDesc();
1476
1477 // Don't fold into target independent nodes. Target independent opcodes
1478 // don't have defined register classes.
1479 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1480 UseDesc.operands()[UseOpIdx].RegClass == -1)
1481 return;
1482 }
1483
1484 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1485 // to enable more folding opportunities. The shrink operands pass
1486 // already does this.
1487
1488 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1489}
1490
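// Constant-fold a 32-bit binary ALU opcode on two known operand values.
// Returns true and sets \p Result when the opcode is handled.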
1491static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1492 uint32_t LHS, uint32_t RHS) {
1493 switch (Opcode) {
1494 case AMDGPU::V_AND_B32_e64:
1495 case AMDGPU::V_AND_B32_e32:
1496 case AMDGPU::S_AND_B32:
1497 Result = LHS & RHS;
1498 return true;
1499 case AMDGPU::V_OR_B32_e64:
1500 case AMDGPU::V_OR_B32_e32:
1501 case AMDGPU::S_OR_B32:
1502 Result = LHS | RHS;
1503 return true;
1504 case AMDGPU::V_XOR_B32_e64:
1505 case AMDGPU::V_XOR_B32_e32:
1506 case AMDGPU::S_XOR_B32:
1507 Result = LHS ^ RHS;
1508 return true;
1509 case AMDGPU::S_XNOR_B32:
1510 Result = ~(LHS ^ RHS);
1511 return true;
1512 case AMDGPU::S_NAND_B32:
1513 Result = ~(LHS & RHS);
1514 return true;
1515 case AMDGPU::S_NOR_B32:
1516 Result = ~(LHS | RHS);
1517 return true;
1518 case AMDGPU::S_ANDN2_B32:
1519 Result = LHS & ~RHS;
1520 return true;
1521 case AMDGPU::S_ORN2_B32:
1522 Result = LHS | ~RHS;
1523 return true;
1524 case AMDGPU::V_LSHL_B32_e64:
1525 case AMDGPU::V_LSHL_B32_e32:
1526 case AMDGPU::S_LSHL_B32:
1527 // The instruction ignores the high bits for out of bounds shifts.
1528 Result = LHS << (RHS & 31);
1529 return true;
1530 case AMDGPU::V_LSHLREV_B32_e64:
1531 case AMDGPU::V_LSHLREV_B32_e32:
1532 Result = RHS << (LHS & 31);
1533 return true;
1534 case AMDGPU::V_LSHR_B32_e64:
1535 case AMDGPU::V_LSHR_B32_e32:
1536 case AMDGPU::S_LSHR_B32:
1537 Result = LHS >> (RHS & 31);
1538 return true;
1539 case AMDGPU::V_LSHRREV_B32_e64:
1540 case AMDGPU::V_LSHRREV_B32_e32:
1541 Result = RHS >> (LHS & 31);
1542 return true;
1543 case AMDGPU::V_ASHR_I32_e64:
1544 case AMDGPU::V_ASHR_I32_e32:
1545 case AMDGPU::S_ASHR_I32:
1546 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1547 return true;
1548 case AMDGPU::V_ASHRREV_I32_e64:
1549 case AMDGPU::V_ASHRREV_I32_e32:
1550 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1551 return true;
1552 default:
1553 return false;
1554 }
1555}
1556
1557static unsigned getMovOpc(bool IsScalar) {
1558 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1559}
1560
1561static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1562 MI.setDesc(NewDesc);
1563
1564 // Remove any leftover implicit operands from mutating the instruction. e.g.
1565 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1566 // anymore.
1567 const MCInstrDesc &Desc = MI.getDesc();
1568 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1569 Desc.implicit_defs().size();
1570
1571 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1572 MI.removeOperand(I);
1573}
1574
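// Return \p Op's immediate value, either directly or by looking through a
// move-immediate that defines its register, applying \p Op's subregister index.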
1575std::optional<int64_t>
1576SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
1577 if (Op.isImm())
1578 return Op.getImm();
1579
1580 if (!Op.isReg() || !Op.getReg().isVirtual())
1581 return std::nullopt;
1582
1583 const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1584 if (Def && Def->isMoveImmediate()) {
1585 const MachineOperand &ImmSrc = Def->getOperand(1);
1586 if (ImmSrc.isImm())
1587 return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1588 }
1589
1590 return std::nullopt;
1591}
1592
1593// Try to simplify operations with a constant that may appear after instruction
1594// selection.
1595// TODO: See if a frame index with a fixed offset can fold.
1596bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1597 if (!MI->allImplicitDefsAreDead())
1598 return false;
1599
1600 unsigned Opc = MI->getOpcode();
1601
1602 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1603 if (Src0Idx == -1)
1604 return false;
1605
1606 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1607 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1608
1609 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1610 Opc == AMDGPU::S_NOT_B32) &&
1611 Src0Imm) {
1612 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1613 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1614 return true;
1615 }
1616
1617 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1618 if (Src1Idx == -1)
1619 return false;
1620
1621 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1622 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1623
1624 if (!Src0Imm && !Src1Imm)
1625 return false;
1626
1627 // and k0, k1 -> v_mov_b32 (k0 & k1)
1628 // or k0, k1 -> v_mov_b32 (k0 | k1)
1629 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1630 if (Src0Imm && Src1Imm) {
1631 int32_t NewImm;
1632 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1633 return false;
1634
1635 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1636
1637 // Be careful to change the right operand; src0 may belong to a different
1638 // instruction.
1639 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1640 MI->removeOperand(Src1Idx);
1641 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1642 return true;
1643 }
1644
1645 if (!MI->isCommutable())
1646 return false;
1647
1648 if (Src0Imm && !Src1Imm) {
1649 std::swap(Src0, Src1);
1650 std::swap(Src0Idx, Src1Idx);
1651 std::swap(Src0Imm, Src1Imm);
1652 }
1653
1654 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1655 if (Opc == AMDGPU::V_OR_B32_e64 ||
1656 Opc == AMDGPU::V_OR_B32_e32 ||
1657 Opc == AMDGPU::S_OR_B32) {
1658 if (Src1Val == 0) {
1659 // y = or x, 0 => y = copy x
1660 MI->removeOperand(Src1Idx);
1661 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1662 } else if (Src1Val == -1) {
1663 // y = or x, -1 => y = v_mov_b32 -1
1664 MI->removeOperand(Src1Idx);
1665 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1666 } else
1667 return false;
1668
1669 return true;
1670 }
1671
1672 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1673 Opc == AMDGPU::S_AND_B32) {
1674 if (Src1Val == 0) {
1675 // y = and x, 0 => y = v_mov_b32 0
1676 MI->removeOperand(Src0Idx);
1677 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1678 } else if (Src1Val == -1) {
1679 // y = and x, -1 => y = copy x
1680 MI->removeOperand(Src1Idx);
1681 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1682 } else
1683 return false;
1684
1685 return true;
1686 }
1687
1688 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1689 Opc == AMDGPU::S_XOR_B32) {
1690 if (Src1Val == 0) {
1691 // y = xor x, 0 => y = copy x
1692 MI->removeOperand(Src1Idx);
1693 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1694 return true;
1695 }
1696 }
1697
1698 return false;
1699}
1700
1701// Try to fold a V_CNDMASK whose source operands are identical into a copy or mov.
1702bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1703 unsigned Opc = MI.getOpcode();
1704 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1705 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1706 return false;
1707
1708 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1709 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1710 if (!Src1->isIdenticalTo(*Src0)) {
1711 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1712 if (!Src1Imm)
1713 return false;
1714
1715 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1716 if (!Src0Imm || *Src0Imm != *Src1Imm)
1717 return false;
1718 }
1719
1720 int Src1ModIdx =
1721 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1722 int Src0ModIdx =
1723 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1724 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1725 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1726 return false;
1727
1728 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1729 auto &NewDesc =
1730 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1731 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1732 if (Src2Idx != -1)
1733 MI.removeOperand(Src2Idx);
1734 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1735 if (Src1ModIdx != -1)
1736 MI.removeOperand(Src1ModIdx);
1737 if (Src0ModIdx != -1)
1738 MI.removeOperand(Src0ModIdx);
1739 mutateCopyOp(MI, NewDesc);
1740 LLVM_DEBUG(dbgs() << MI);
1741 return true;
1742}
1743
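// Fold away a mask of the low 16 bits when the masked value is produced by an
// instruction already known to zero the high half of its result. Illustrative
// sketch (abridged):
//   %1 = <instruction known to zero the high 16 bits of its dest>
//   %2 = V_AND_B32_e64 0xffff, %1
// All uses of %2 are rewritten to use %1 and the V_AND is erased.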
1744bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1745 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1746 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1747 return false;
1748
1749 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
1750 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1751 return false;
1752
1753 Register Src1 = MI.getOperand(2).getReg();
1754 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1755 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1756 return false;
1757
1758 Register Dst = MI.getOperand(0).getReg();
1759 MRI->replaceRegWith(Dst, Src1);
1760 if (!MI.getOperand(2).isKill())
1761 MRI->clearKillFlags(Src1);
1762 MI.eraseFromParent();
1763 return true;
1764}
1765
1766bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1767 const FoldableDef &OpToFold) const {
1768 // We need to mutate the operands of new mov instructions to add implicit
1769 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1770 // this.
1771 SmallVector<MachineInstr *, 4> CopiesToReplace;
1772 SmallVector<FoldCandidate, 4> FoldList;
1773 MachineOperand &Dst = MI.getOperand(0);
1774 bool Changed = false;
1775
1776 if (OpToFold.isImm()) {
1777 for (auto &UseMI :
1778 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1779 // Folding the immediate may reveal operations that can be constant
1780 // folded or replaced with a copy. This can happen for example after
1781 // frame indices are lowered to constants or from splitting 64-bit
1782 // constants.
1783 //
1784 // We may also encounter cases where one or both operands are
1785 // immediates materialized into a register, which would ordinarily not
1786 // be folded due to multiple uses or operand constraints.
1787 if (tryConstantFoldOp(&UseMI)) {
1788 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1789 Changed = true;
1790 }
1791 }
1792 }
1793
1794 SmallVector<MachineOperand *, 4> UsesToProcess(
1795 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1796 for (auto *U : UsesToProcess) {
1797 MachineInstr *UseMI = U->getParent();
1798
1799 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1800 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1801 CopiesToReplace);
1802 }
1803
1804 if (CopiesToReplace.empty() && FoldList.empty())
1805 return Changed;
1806
1807 MachineFunction *MF = MI.getParent()->getParent();
1808 // Make sure we add EXEC uses to any new v_mov instructions created.
1809 for (MachineInstr *Copy : CopiesToReplace)
1810 Copy->addImplicitDefUseOperands(*MF);
1811
1812 SetVector<MachineInstr *> ConstantFoldCandidates;
1813 for (FoldCandidate &Fold : FoldList) {
1814 assert(!Fold.isReg() || Fold.Def.OpToFold);
1815 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1816 Register Reg = Fold.getReg();
1817 const MachineInstr *DefMI = Fold.Def.DefMI;
1818 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1819 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1820 continue;
1821 }
1822 if (updateOperand(Fold)) {
1823 // Clear kill flags.
1824 if (Fold.isReg()) {
1825 assert(Fold.Def.OpToFold && Fold.isReg());
1826 // FIXME: Probably shouldn't bother trying to fold if not an
1827 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1828 // copies.
1829 MRI->clearKillFlags(Fold.getReg());
1830 }
1831 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1832 << static_cast<int>(Fold.UseOpNo) << " of "
1833 << *Fold.UseMI);
1834
1835 if (Fold.isImm())
1836 ConstantFoldCandidates.insert(Fold.UseMI);
1837
1838 } else if (Fold.Commuted) {
1839 // Restoring instruction's original operand order if fold has failed.
1840 TII->commuteInstruction(*Fold.UseMI, false);
1841 }
1842 }
1843
1844 for (MachineInstr *MI : ConstantFoldCandidates) {
1845 if (tryConstantFoldOp(MI)) {
1846 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1847 Changed = true;
1848 }
1849 }
1850 return true;
1851}
1852
1853/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1854/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
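/// Illustrative sketch (register classes and operands abridged):
///   %0:vgpr_32 = V_MOV_B32_e32 0
///   %1:vreg_64 = REG_SEQUENCE %0, sub0, %0, sub1
///   %2:areg_64 = COPY %1
/// becomes
///   %3:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0
///   %4:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0
///   %2:areg_64 = REG_SEQUENCE %3, sub0, %4, sub1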
1855bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1856 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1857 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1858 // initializers right here, so we will rematerialize immediates and avoid
1859 // copies via different reg classes.
1860 const TargetRegisterClass *DefRC =
1861 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1862 if (!TRI->isAGPRClass(DefRC))
1863 return false;
1864
1865 Register UseReg = CopyMI->getOperand(1).getReg();
1866 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1867 if (!RegSeq || !RegSeq->isRegSequence())
1868 return false;
1869
1870 const DebugLoc &DL = CopyMI->getDebugLoc();
1871 MachineBasicBlock &MBB = *CopyMI->getParent();
1872
1873 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1874 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1875
1876 const TargetRegisterClass *UseRC =
1877 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1878
1879 // Value, subreg index pairs for the new REG_SEQUENCE.
1880 SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
1881
1882 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1883 unsigned NumFoldable = 0;
1884
1885 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1886 MachineOperand &RegOp = RegSeq->getOperand(I);
1887 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1888
1889 if (RegOp.getSubReg()) {
1890 // TODO: Handle subregister compose
1891 NewDefs.emplace_back(&RegOp, SubRegIdx);
1892 continue;
1893 }
1894
1895 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1896 if (!Lookup)
1897 Lookup = &RegOp;
1898
1899 if (Lookup->isImm()) {
1900 // Check if this is an agpr_32 subregister.
1901 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1902 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1903 if (DestSuperRC &&
1904 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1905 ++NumFoldable;
1906 NewDefs.emplace_back(Lookup, SubRegIdx);
1907 continue;
1908 }
1909 }
1910
1911 const TargetRegisterClass *InputRC =
1912 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1913 : MRI->getRegClass(RegOp.getReg());
1914
1915 // TODO: Account for Lookup->getSubReg()
1916
1917 // If we can't find a matching super class, this is an SGPR->AGPR or
1918 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1919 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1920 // want to rewrite to copy to an intermediate VGPR class.
1921 const TargetRegisterClass *MatchRC =
1922 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1923 if (!MatchRC) {
1924 ++NumFoldable;
1925 NewDefs.emplace_back(&RegOp, SubRegIdx);
1926 continue;
1927 }
1928
1929 NewDefs.emplace_back(&RegOp, SubRegIdx);
1930 }
1931
1932 // Do not clone a reg_sequence and merely change the result register class.
1933 if (NumFoldable == 0)
1934 return false;
1935
1936 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1937 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1938 CopyMI->removeOperand(I);
1939
1940 for (auto [Def, DestSubIdx] : NewDefs) {
1941 if (!Def->isReg()) {
1942 // TODO: Should we use single write for each repeated value like in
1943 // register case?
1944 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1945 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1946 .add(*Def);
1947 B.addReg(Tmp);
1948 } else {
1949 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1950 Def->setIsKill(false);
1951
1952 Register &VGPRCopy = VGPRCopies[Src];
1953 if (!VGPRCopy) {
1954 const TargetRegisterClass *VGPRUseSubRC =
1955 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1956
1957 // We cannot build a reg_sequence out of the same registers, they
1958 // must be copied. Better to do it here before copyPhysReg() creates
1959 // several reads to do the AGPR->VGPR->AGPR copy.
1960
1961 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1962 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1963 // later, create a copy here and track if we already have such a copy.
1964 const TargetRegisterClass *SubRC =
1965 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1966 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1967 // TODO: Try to reconstrain class
1968 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1969 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1970 B.addReg(VGPRCopy);
1971 } else {
1972 // If it is already a VGPR, do not copy the register.
1973 B.add(*Def);
1974 }
1975 } else {
1976 B.addReg(VGPRCopy);
1977 }
1978 }
1979
1980 B.addImm(DestSubIdx);
1981 }
1982
1983 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
1984 return true;
1985}
1986
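// Handle a foldable copy-like instruction: track redundant rewrites of m0,
// then try to fold the source value (immediate, frame index, global address
// or register) into every user of the destination register, erasing the copy
// if it ends up dead.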
1987bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1988 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1989 Register DstReg = MI.getOperand(0).getReg();
1990 // Specially track simple redefs of m0 to the same value in a block, so we
1991 // can erase the later ones.
1992 if (DstReg == AMDGPU::M0) {
1993 MachineOperand &NewM0Val = MI.getOperand(1);
1994 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1995 MI.eraseFromParent();
1996 return true;
1997 }
1998
1999 // We aren't tracking other physical registers
2000 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
2001 ? nullptr
2002 : &NewM0Val;
2003 return false;
2004 }
2005
2006 MachineOperand *OpToFoldPtr;
2007 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2008 // Folding when any src_modifiers are non-zero is unsupported
2009 if (TII->hasAnyModifiersSet(MI))
2010 return false;
2011 OpToFoldPtr = &MI.getOperand(2);
2012 } else
2013 OpToFoldPtr = &MI.getOperand(1);
2014 MachineOperand &OpToFold = *OpToFoldPtr;
2015 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2016
2017 // FIXME: We could also be folding things like TargetIndexes.
2018 if (!FoldingImm && !OpToFold.isReg())
2019 return false;
2020
2021 // Fold virtual registers and constant physical registers.
2022 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2023 !TRI->isConstantPhysReg(OpToFold.getReg()))
2024 return false;
2025
2026 // Prevent folding operands backwards in the function. For example,
2027 // the COPY opcode must not be replaced by 1 in this example:
2028 //
2029 // %3 = COPY %vgpr0; VGPR_32:%3
2030 // ...
2031 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2032 if (!DstReg.isVirtual())
2033 return false;
2034
2035 const TargetRegisterClass *DstRC =
2036 MRI->getRegClass(MI.getOperand(0).getReg());
2037
2038 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2039 // Can remove this code if proper 16-bit SGPRs are implemented
2040 // Example: Pre-peephole-opt
2041 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2042 // %32:sreg_32 = COPY %29:sgpr_lo16
2043 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2044 // Post-peephole-opt and DCE
2045 // %32:sreg_32 = COPY %16.lo16:sreg_32
2046 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2047 // After this transform
2048 // %32:sreg_32 = COPY %16:sreg_32
2049 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2050 // After the fold operands pass
2051 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2052 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2053 OpToFold.getSubReg()) {
2054 if (DstRC == &AMDGPU::SReg_32RegClass &&
2055 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2056 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2057 OpToFold.setSubReg(0);
2058 }
2059 }
2060
2061 // Fold copy to AGPR through reg_sequence
2062 // TODO: Handle with subregister extract
2063 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2064 if (foldCopyToAGPRRegSequence(&MI))
2065 return true;
2066 }
2067
2068 FoldableDef Def(OpToFold, DstRC);
2069 bool Changed = foldInstOperand(MI, Def);
2070
2071 // If we managed to fold all uses of this copy then we might as well
2072 // delete it now.
2073 // The only reason we need to follow chains of copies here is that
2074 // tryFoldRegSequence looks forward through copies before folding a
2075 // REG_SEQUENCE into its eventual users.
2076 auto *InstToErase = &MI;
2077 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2078 auto &SrcOp = InstToErase->getOperand(1);
2079 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2080 InstToErase->eraseFromParent();
2081 Changed = true;
2082 InstToErase = nullptr;
2083 if (!SrcReg || SrcReg.isPhysical())
2084 break;
2085 InstToErase = MRI->getVRegDef(SrcReg);
2086 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2087 break;
2088 }
2089
2090 if (InstToErase && InstToErase->isRegSequence() &&
2091 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2092 InstToErase->eraseFromParent();
2093 Changed = true;
2094 }
2095
2096 if (Changed)
2097 return true;
2098
2099 // Run this after foldInstOperand to avoid turning scalar additions into
2100 // vector additions when the scalar result could just be folded into
2101 // the user(s).
2102 return OpToFold.isReg() &&
2103 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2104}
2105
2106// Clamp patterns are canonically selected to v_max_* instructions, so only
2107// handle them.
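// Illustrative sketch of the pattern (source modifiers omitted):
//   %1 = V_MAX_F32_e64 %0, %0, clamp
// i.e. a max of a value with itself whose only effect is the clamp bit.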
2108const MachineOperand *
2109SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2110 unsigned Op = MI.getOpcode();
2111 switch (Op) {
2112 case AMDGPU::V_MAX_F32_e64:
2113 case AMDGPU::V_MAX_F16_e64:
2114 case AMDGPU::V_MAX_F16_t16_e64:
2115 case AMDGPU::V_MAX_F16_fake16_e64:
2116 case AMDGPU::V_MAX_F64_e64:
2117 case AMDGPU::V_MAX_NUM_F64_e64:
2118 case AMDGPU::V_PK_MAX_F16:
2119 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2120 case AMDGPU::V_PK_MAX_NUM_BF16: {
2121 if (MI.mayRaiseFPException())
2122 return nullptr;
2123
2124 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2125 return nullptr;
2126
2127 // Make sure sources are identical.
2128 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2129 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2130 if (!Src0->isReg() || !Src1->isReg() ||
2131 Src0->getReg() != Src1->getReg() ||
2132 Src0->getSubReg() != Src1->getSubReg() ||
2133 Src0->getSubReg() != AMDGPU::NoSubRegister)
2134 return nullptr;
2135
2136 // Can't fold up if we have modifiers.
2137 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2138 return nullptr;
2139
2140 unsigned Src0Mods
2141 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2142 unsigned Src1Mods
2143 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2144
2145 // Having a 0 op_sel_hi would require swizzling the output in the source
2146 // instruction, which we can't do.
2147 unsigned UnsetMods =
2148 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2149 ? SISrcMods::OP_SEL_1
2150 : 0u;
2151 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2152 return nullptr;
2153 return Src0;
2154 }
2155 default:
2156 return nullptr;
2157 }
2158}
2159
2160// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2161bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2162 const MachineOperand *ClampSrc = isClamp(MI);
2163 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2164 return false;
2165
2166 if (!ClampSrc->getReg().isVirtual())
2167 return false;
2168
2169 // Look through COPY. COPY only observed with True16.
2170 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2171 MachineInstr *Def =
2172 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2173
2174 // The type of clamp must be compatible.
2175 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2176 return false;
2177
2178 if (Def->mayRaiseFPException())
2179 return false;
2180
2181 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2182 if (!DefClamp)
2183 return false;
2184
2185 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2186
2187 // Clamp is applied after omod, so it is OK if omod is set.
2188 DefClamp->setImm(1);
2189
2190 Register DefReg = Def->getOperand(0).getReg();
2191 Register MIDstReg = MI.getOperand(0).getReg();
2192 if (TRI->isSGPRReg(*MRI, DefReg)) {
2193 // Pseudo scalar instructions have an SGPR for dst and clamp is a v_max*
2194 // instruction with a VGPR dst.
2195 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2196 MIDstReg)
2197 .addReg(DefReg);
2198 } else {
2199 MRI->replaceRegWith(MIDstReg, DefReg);
2200 }
2201 MI.eraseFromParent();
2202
2203 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2204 // instruction, so we might as well convert it to the more flexible VOP3-only
2205 // mad/fma form.
2206 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2207 Def->eraseFromParent();
2208
2209 return true;
2210}
2211
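// Map a multiplier constant (0.5, 2.0 or 4.0, encoded in the precision of the
// given mul opcode) to the corresponding output-modifier value, or
// SIOutMods::NONE if the constant has no omod equivalent.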
2212static int getOModValue(unsigned Opc, int64_t Val) {
2213 switch (Opc) {
2214 case AMDGPU::V_MUL_F64_e64:
2215 case AMDGPU::V_MUL_F64_pseudo_e64: {
2216 switch (Val) {
2217 case 0x3fe0000000000000: // 0.5
2218 return SIOutMods::DIV2;
2219 case 0x4000000000000000: // 2.0
2220 return SIOutMods::MUL2;
2221 case 0x4010000000000000: // 4.0
2222 return SIOutMods::MUL4;
2223 default:
2224 return SIOutMods::NONE;
2225 }
2226 }
2227 case AMDGPU::V_MUL_F32_e64: {
2228 switch (static_cast<uint32_t>(Val)) {
2229 case 0x3f000000: // 0.5
2230 return SIOutMods::DIV2;
2231 case 0x40000000: // 2.0
2232 return SIOutMods::MUL2;
2233 case 0x40800000: // 4.0
2234 return SIOutMods::MUL4;
2235 default:
2236 return SIOutMods::NONE;
2237 }
2238 }
2239 case AMDGPU::V_MUL_F16_e64:
2240 case AMDGPU::V_MUL_F16_t16_e64:
2241 case AMDGPU::V_MUL_F16_fake16_e64: {
2242 switch (static_cast<uint16_t>(Val)) {
2243 case 0x3800: // 0.5
2244 return SIOutMods::DIV2;
2245 case 0x4000: // 2.0
2246 return SIOutMods::MUL2;
2247 case 0x4400: // 4.0
2248 return SIOutMods::MUL4;
2249 default:
2250 return SIOutMods::NONE;
2251 }
2252 }
2253 default:
2254 llvm_unreachable("invalid mul opcode");
2255 }
2256}
2257
2258// FIXME: Does this really not support denormals with f16?
2259// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2260// handled, so will anything other than that break?
2261std::pair<const MachineOperand *, int>
2262SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2263 unsigned Op = MI.getOpcode();
2264 switch (Op) {
2265 case AMDGPU::V_MUL_F64_e64:
2266 case AMDGPU::V_MUL_F64_pseudo_e64:
2267 case AMDGPU::V_MUL_F32_e64:
2268 case AMDGPU::V_MUL_F16_t16_e64:
2269 case AMDGPU::V_MUL_F16_fake16_e64:
2270 case AMDGPU::V_MUL_F16_e64: {
2271 // If output denormals are enabled, omod is ignored.
2272 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2273 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2274 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2275 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2276 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2277 MFI->getMode().FP64FP16Denormals.Output !=
2278 DenormalMode::PreserveSign) ||
2279 MI.mayRaiseFPException())
2280 return std::pair(nullptr, SIOutMods::NONE);
2281
2282 const MachineOperand *RegOp = nullptr;
2283 const MachineOperand *ImmOp = nullptr;
2284 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2285 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2286 if (Src0->isImm()) {
2287 ImmOp = Src0;
2288 RegOp = Src1;
2289 } else if (Src1->isImm()) {
2290 ImmOp = Src1;
2291 RegOp = Src0;
2292 } else
2293 return std::pair(nullptr, SIOutMods::NONE);
2294
2295 int OMod = getOModValue(Op, ImmOp->getImm());
2296 if (OMod == SIOutMods::NONE ||
2297 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2298 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2299 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2300 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2301 return std::pair(nullptr, SIOutMods::NONE);
2302
2303 return std::pair(RegOp, OMod);
2304 }
2305 case AMDGPU::V_ADD_F64_e64:
2306 case AMDGPU::V_ADD_F64_pseudo_e64:
2307 case AMDGPU::V_ADD_F32_e64:
2308 case AMDGPU::V_ADD_F16_e64:
2309 case AMDGPU::V_ADD_F16_t16_e64:
2310 case AMDGPU::V_ADD_F16_fake16_e64: {
2311 // If output denormals are enabled, omod is ignored.
2312 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2313 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2314 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2315 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2316 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2317 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
2318 return std::pair(nullptr, SIOutMods::NONE);
2319
2320 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2321 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2322 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2323
2324 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2325 Src0->getSubReg() == Src1->getSubReg() &&
2326 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2327 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2328 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2329 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2330 return std::pair(Src0, SIOutMods::MUL2);
2331
2332 return std::pair(nullptr, SIOutMods::NONE);
2333 }
2334 default:
2335 return std::pair(nullptr, SIOutMods::NONE);
2336 }
2337}
2338
2339// FIXME: Does this need to check IEEE bit on function?
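// Illustrative sketch of the omod fold (operands abridged):
//   %0 = V_ADD_F32_e64 %a, %b          ; omod unset
//   %1 = V_MUL_F32_e64 2.0, %0         ; only use of %0
// becomes
//   %0 = V_ADD_F32_e64 %a, %b, omod:MUL2
// with all uses of %1 rewritten to %0.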
2340bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2341 const MachineOperand *RegOp;
2342 int OMod;
2343 std::tie(RegOp, OMod) = isOMod(MI);
2344 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2345 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2346 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2347 return false;
2348
2349 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2350 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2351 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2352 return false;
2353
2354 if (Def->mayRaiseFPException())
2355 return false;
2356
2357 // Clamp is applied after omod. If the source already has clamp set, don't
2358 // fold it.
2359 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2360 return false;
2361
2362 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2363
2364 DefOMod->setImm(OMod);
2365 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2366 // Kill flags can be wrong if we replaced a def inside a loop with a def
2367 // outside the loop.
2368 MRI->clearKillFlags(Def->getOperand(0).getReg());
2369 MI.eraseFromParent();
2370
2371 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2372 // instruction, so we might as well convert it to the more flexible VOP3-only
2373 // mad/fma form.
2374 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2375 Def->eraseFromParent();
2376
2377 return true;
2378}
2379
2380// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2381// instruction which can take an agpr. So far that means a store.
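// Illustrative sketch (operands abridged):
//   %0:vreg_64 = REG_SEQUENCE %a0:agpr_32, sub0, %a1:agpr_32, sub1
//   <store> ..., %0, ...
// becomes
//   %1:areg_64 = REG_SEQUENCE %a0, sub0, %a1, sub1
//   <store> ..., %1, ...
// provided the store operand accepts an AGPR-capable register class.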
2382bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2383 assert(MI.isRegSequence());
2384 auto Reg = MI.getOperand(0).getReg();
2385
2386 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2387 !MRI->hasOneNonDBGUse(Reg))
2388 return false;
2389
2390 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
2391 if (!getRegSeqInit(Defs, Reg))
2392 return false;
2393
2394 for (auto &[Op, SubIdx] : Defs) {
2395 if (!Op->isReg())
2396 return false;
2397 if (TRI->isAGPR(*MRI, Op->getReg()))
2398 continue;
2399 // Maybe this is a COPY from AREG
2400 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2401 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2402 return false;
2403 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2404 return false;
2405 }
2406
2407 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2408 MachineInstr *UseMI = Op->getParent();
2409 while (UseMI->isCopy() && !Op->getSubReg()) {
2410 Reg = UseMI->getOperand(0).getReg();
2411 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2412 return false;
2413 Op = &*MRI->use_nodbg_begin(Reg);
2414 UseMI = Op->getParent();
2415 }
2416
2417 if (Op->getSubReg())
2418 return false;
2419
2420 unsigned OpIdx = Op - &UseMI->getOperand(0);
2421 const MCInstrDesc &InstDesc = UseMI->getDesc();
2422 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
2423 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2424 return false;
2425
2426 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2427 auto Dst = MRI->createVirtualRegister(NewDstRC);
2428 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2429 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2430
2431 for (auto &[Def, SubIdx] : Defs) {
2432 Def->setIsKill(false);
2433 if (TRI->isAGPR(*MRI, Def->getReg())) {
2434 RS.add(*Def);
2435 } else { // This is a copy
2436 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2437 SubDef->getOperand(1).setIsKill(false);
2438 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
2439 }
2440 RS.addImm(SubIdx);
2441 }
2442
2443 Op->setReg(Dst);
2444 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2445 Op->setReg(Reg);
2446 RS->eraseFromParent();
2447 return false;
2448 }
2449
2450 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2451
2452 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2453 // in which case we can erase them all later in runOnMachineFunction.
2454 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2455 MI.eraseFromParent();
2456 return true;
2457}
2458
2459 /// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
2460/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2461static bool isAGPRCopy(const SIRegisterInfo &TRI,
2462 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2463 Register &OutReg, unsigned &OutSubReg) {
2464 assert(Copy.isCopy());
2465
2466 const MachineOperand &CopySrc = Copy.getOperand(1);
2467 Register CopySrcReg = CopySrc.getReg();
2468 if (!CopySrcReg.isVirtual())
2469 return false;
2470
2471 // Common case: copy from AGPR directly, e.g.
2472 // %1:vgpr_32 = COPY %0:agpr_32
2473 if (TRI.isAGPR(MRI, CopySrcReg)) {
2474 OutReg = CopySrcReg;
2475 OutSubReg = CopySrc.getSubReg();
2476 return true;
2477 }
2478
2479 // Sometimes it can also involve two copies, e.g.
2480 // %1:vgpr_256 = COPY %0:agpr_256
2481 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2482 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2483 if (!CopySrcDef || !CopySrcDef->isCopy())
2484 return false;
2485
2486 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2487 Register OtherCopySrcReg = OtherCopySrc.getReg();
2488 if (!OtherCopySrcReg.isVirtual() ||
2489 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2490 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2491 !TRI.isAGPR(MRI, OtherCopySrcReg))
2492 return false;
2493
2494 OutReg = OtherCopySrcReg;
2495 OutSubReg = CopySrc.getSubReg();
2496 return true;
2497}
2498
2499// Try to hoist an AGPR to VGPR copy across a PHI.
2500// This should allow folding of an AGPR into a consumer which may support it.
2501//
2502// Example 1: LCSSA PHI
2503// loop:
2504// %1:vreg = COPY %0:areg
2505// exit:
2506// %2:vreg = PHI %1:vreg, %loop
2507// =>
2508// loop:
2509// exit:
2510// %1:areg = PHI %0:areg, %loop
2511// %2:vreg = COPY %1:areg
2512//
2513// Example 2: PHI with multiple incoming values:
2514// entry:
2515// %1:vreg = GLOBAL_LOAD(..)
2516// loop:
2517// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2518// %3:areg = COPY %2:vreg
2519// %4:areg = (instr using %3:areg)
2520// %5:vreg = COPY %4:areg
2521// =>
2522// entry:
2523// %1:vreg = GLOBAL_LOAD(..)
2524// %2:areg = COPY %1:vreg
2525// loop:
2526// %3:areg = PHI %2:areg, %entry, %X:areg,
2527// %4:areg = (instr using %3:areg)
2528bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2529 assert(PHI.isPHI());
2530
2531 Register PhiOut = PHI.getOperand(0).getReg();
2532 if (!TRI->isVGPR(*MRI, PhiOut))
2533 return false;
2534
2535 // Iterate once over all incoming values of the PHI to check if this PHI is
2536 // eligible, and determine the exact AGPR RC we'll target.
2537 const TargetRegisterClass *ARC = nullptr;
2538 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2539 MachineOperand &MO = PHI.getOperand(K);
2540 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2541 if (!Copy || !Copy->isCopy())
2542 continue;
2543
2544 Register AGPRSrc;
2545 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2546 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2547 continue;
2548
2549 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2550 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2551 CopyInRC = SubRC;
2552
2553 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2554 return false;
2555 ARC = CopyInRC;
2556 }
2557
2558 if (!ARC)
2559 return false;
2560
2561 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2562
2563 // Rewrite the PHI's incoming values to ARC.
2564 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2565 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2566 MachineOperand &MO = PHI.getOperand(K);
2567 Register Reg = MO.getReg();
2568
2569 MachineBasicBlock::iterator InsertPt;
2570 MachineBasicBlock *InsertMBB = nullptr;
2571
2572 // Look at the def of Reg, ignoring all copies.
2573 unsigned CopyOpc = AMDGPU::COPY;
2574 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2575
2576 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2577 // the copy was single-use, it will be removed by DCE later.
2578 if (Def->isCopy()) {
2579 Register AGPRSrc;
2580 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2581 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2582 MO.setReg(AGPRSrc);
2583 MO.setSubReg(AGPRSubReg);
2584 continue;
2585 }
2586
2587 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2588 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2589 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2590 // is unlikely to be profitable.
2591 //
2592 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2593 MachineOperand &CopyIn = Def->getOperand(1);
2594 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2595 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2596 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2597 }
2598
2599 InsertMBB = Def->getParent();
2600 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2601 } else {
2602 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2603 InsertPt = InsertMBB->getFirstTerminator();
2604 }
2605
2606 Register NewReg = MRI->createVirtualRegister(ARC);
2607 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2608 TII->get(CopyOpc), NewReg)
2609 .addReg(Reg);
2610 MO.setReg(NewReg);
2611
2612 (void)MI;
2613 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2614 }
2615
2616 // Replace the PHI's result with a new register.
2617 Register NewReg = MRI->createVirtualRegister(ARC);
2618 PHI.getOperand(0).setReg(NewReg);
2619
2620 // COPY that new register back to the original PhiOut register. This COPY will
2621 // usually be folded out later.
2622 MachineBasicBlock *MBB = PHI.getParent();
2623 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2624 TII->get(AMDGPU::COPY), PhiOut)
2625 .addReg(NewReg);
2626
2627 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2628 return true;
2629}
2630
2631// Attempt to convert VGPR load to an AGPR load.
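// If every transitive user of the loaded value is a COPY or REG_SEQUENCE whose
// result ultimately lands in AGPRs, retype the load's result register (and the
// intermediate virtual registers) to the equivalent AGPR classes so the load
// writes AGPRs directly.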
2632bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2633 assert(MI.mayLoad());
2634 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2635 return false;
2636
2637 MachineOperand &Def = MI.getOperand(0);
2638 if (!Def.isDef())
2639 return false;
2640
2641 Register DefReg = Def.getReg();
2642
2643 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2644 return false;
2645
2646 SmallVector<const MachineInstr *, 8> Users(
2647 llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
2648 SmallVector<Register, 8> MoveRegs;
2649
2650 if (Users.empty())
2651 return false;
2652
2653 // Check that all uses are a copy to an agpr or a reg_sequence producing an agpr.
2654 while (!Users.empty()) {
2655 const MachineInstr *I = Users.pop_back_val();
2656 if (!I->isCopy() && !I->isRegSequence())
2657 return false;
2658 Register DstReg = I->getOperand(0).getReg();
2659 // Physical registers may have more than one defining instruction
2660 if (DstReg.isPhysical())
2661 return false;
2662 if (TRI->isAGPR(*MRI, DstReg))
2663 continue;
2664 MoveRegs.push_back(DstReg);
2665 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2666 Users.push_back(&U);
2667 }
2668
2669 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2670 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2671 if (!TII->isOperandLegal(MI, 0, &Def)) {
2672 MRI->setRegClass(DefReg, RC);
2673 return false;
2674 }
2675
2676 while (!MoveRegs.empty()) {
2677 Register Reg = MoveRegs.pop_back_val();
2678 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2679 }
2680
2681 LLVM_DEBUG(dbgs() << "Folded " << MI);
2682
2683 return true;
2684}
2685
2686// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2687// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2688 // there are cases where it can create a lot more AGPR-AGPR copies, which are
2689// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2690//
2691// This function looks at all AGPR PHIs in a basic block and collects their
2692 // operands. Then, it checks for registers that are used more than once across
2693// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2694// having to create one VGPR temporary per use, which can get very messy if
2695// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2696// element).
2697//
2698// Example
2699// a:
2700// %in:agpr_256 = COPY %foo:vgpr_256
2701// c:
2702// %x:agpr_32 = ..
2703// b:
2704// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2705// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2706// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2707// =>
2708// a:
2709// %in:agpr_256 = COPY %foo:vgpr_256
2710// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2711// %tmp_agpr:agpr_32 = COPY %tmp
2712// c:
2713// %x:agpr_32 = ..
2714// b:
2715// %0:areg = PHI %tmp_agpr, %a, %x, %c
2716// %1:areg = PHI %tmp_agpr, %a, %y, %c
2717// %2:areg = PHI %tmp_agpr, %a, %z, %c
2718bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2719 // This is only really needed on GFX908 where AGPR-AGPR copies are
2720 // unreasonably difficult.
2721 if (ST->hasGFX90AInsts())
2722 return false;
2723
2724 // Look at all AGPR Phis and collect the register + subregister used.
2725 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2726 RegToMO;
2727
2728 for (auto &MI : MBB) {
2729 if (!MI.isPHI())
2730 break;
2731
2732 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2733 continue;
2734
2735 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2736 MachineOperand &PhiMO = MI.getOperand(K);
2737 if (!PhiMO.getSubReg())
2738 continue;
2739 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2740 }
2741 }
2742
2743 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2744 // a VGPR.
2745 bool Changed = false;
2746 for (const auto &[Entry, MOs] : RegToMO) {
2747 if (MOs.size() == 1)
2748 continue;
2749
2750 const auto [Reg, SubReg] = Entry;
2751 MachineInstr *Def = MRI->getVRegDef(Reg);
2752 MachineBasicBlock *DefMBB = Def->getParent();
2753
2754 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2755 // out.
2756 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2757 Register TempVGPR =
2758 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2759 MachineInstr *VGPRCopy =
2760 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2761 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2762 .addReg(Reg, /* flags */ 0, SubReg);
2763
2764 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2765 Register TempAGPR = MRI->createVirtualRegister(ARC);
2766 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2767 TII->get(AMDGPU::COPY), TempAGPR)
2768 .addReg(TempVGPR);
2769
2770 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2771 for (MachineOperand *MO : MOs) {
2772 MO->setReg(TempAGPR);
2773 MO->setSubReg(AMDGPU::NoSubRegister);
2774 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2775 }
2776
2777 Changed = true;
2778 }
2779
2780 return Changed;
2781}
2782
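// Main driver: visit blocks in depth-first order and apply the individual
// folds to each instruction, tracking the last value known to be written to
// m0 within the current block.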
2783bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2784 this->MF = &MF;
2785 MRI = &MF.getRegInfo();
2786 ST = &MF.getSubtarget<GCNSubtarget>();
2787 TII = ST->getInstrInfo();
2788 TRI = &TII->getRegisterInfo();
2789 MFI = MF.getInfo<SIMachineFunctionInfo>();
2790
2791 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2792 // correctly handle signed zeros.
2793 //
2794 // FIXME: Also need to check strictfp
2795 bool IsIEEEMode = MFI->getMode().IEEE;
2796 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2797
2798 bool Changed = false;
2799 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2800 MachineOperand *CurrentKnownM0Val = nullptr;
2801 for (auto &MI : make_early_inc_range(*MBB)) {
2802 Changed |= tryFoldCndMask(MI);
2803
2804 if (tryFoldZeroHighBits(MI)) {
2805 Changed = true;
2806 continue;
2807 }
2808
2809 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2810 Changed = true;
2811 continue;
2812 }
2813
2814 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2815 Changed = true;
2816 continue;
2817 }
2818
2819 if (MI.mayLoad() && tryFoldLoad(MI)) {
2820 Changed = true;
2821 continue;
2822 }
2823
2824 if (TII->isFoldableCopy(MI)) {
2825 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2826 continue;
2827 }
2828
2829 // Saw an unknown clobber of m0, so we no longer know what it is.
2830 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2831 CurrentKnownM0Val = nullptr;
2832
2833 // TODO: Omod might be OK if there is NSZ only on the source
2834 // instruction, and not the omod multiply.
2835 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2836 !tryFoldOMod(MI))
2837 Changed |= tryFoldClamp(MI);
2838 }
2839
2840 Changed |= tryOptimizeAGPRPhis(*MBB);
2841 }
2842
2843 return Changed;
2844}
2845
2846 PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2847 MachineFunctionAnalysisManager &MFAM) {
2848 MFPropsModifier _(*this, MF);
2849
2850 bool Changed = SIFoldOperandsImpl().run(MF);
2851 if (!Changed) {
2852 return PreservedAnalyses::all();
2853 }
2854 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2855 PA.preserveSet<CFGAnalyses>();
2856 return PA;
2857}