LLVM 23.0.0git
SIFoldOperands.cpp
Go to the documentation of this file.
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
15#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarryInsts())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 bool tryConstantFoldOp(MachineInstr *MI) const;
246 bool tryFoldCndMask(MachineInstr &MI) const;
247 bool tryFoldZeroHighBits(MachineInstr &MI) const;
248 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
249
250 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
251 bool tryFoldFoldableCopy(MachineInstr &MI,
252 MachineOperand *&CurrentKnownM0Val) const;
253
254 const MachineOperand *isClamp(const MachineInstr &MI) const;
255 bool tryFoldClamp(MachineInstr &MI);
256
257 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
258 bool tryFoldOMod(MachineInstr &MI);
259 bool tryFoldRegSequence(MachineInstr &MI);
260 bool tryFoldPhiAGPR(MachineInstr &MI);
261 bool tryFoldLoad(MachineInstr &MI);
262
263 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
264
265public:
266 SIFoldOperandsImpl() = default;
267
268 bool run(MachineFunction &MF);
269};
270
271class SIFoldOperandsLegacy : public MachineFunctionPass {
272public:
273 static char ID;
274
275 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
276
277 bool runOnMachineFunction(MachineFunction &MF) override {
278 if (skipFunction(MF.getFunction()))
279 return false;
280 return SIFoldOperandsImpl().run(MF);
281 }
282
283 StringRef getPassName() const override { return "SI Fold Operands"; }
284
285 void getAnalysisUsage(AnalysisUsage &AU) const override {
286 AU.setPreservesCFG();
288 }
289
290 MachineFunctionProperties getRequiredProperties() const override {
291 return MachineFunctionProperties().setIsSSA();
292 }
293};
294
295} // End anonymous namespace.
296
297INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
298 false)
299
300char SIFoldOperandsLegacy::ID = 0;
301
302char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
303
306 const MachineOperand &MO) {
307 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
308 if (const TargetRegisterClass *SubRC =
309 TRI.getSubRegisterClass(RC, MO.getSubReg()))
310 RC = SubRC;
311 return RC;
312}
313
// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
// Returns AMDGPU::INSTRUCTION_LIST_END when \p Opc has no three-address
// MAD/FMA equivalent.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_t16_e64;
  case AMDGPU::V_FMAC_F16_fake16_e64:
    return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
336
337// TODO: Add heuristic that the frame index might not fit in the addressing mode
338// immediate offset to avoid materializing in loops.
339bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
340 const FoldableDef &OpToFold) const {
341 if (!OpToFold.isFI())
342 return false;
343
344 const unsigned Opc = UseMI.getOpcode();
345 switch (Opc) {
346 case AMDGPU::S_ADD_I32:
347 case AMDGPU::S_ADD_U32:
348 case AMDGPU::V_ADD_U32_e32:
349 case AMDGPU::V_ADD_CO_U32_e32:
350 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
351 // to insert the wave size shift at every point we use the index.
352 // TODO: Fix depending on visit order to fold immediates into the operand
353 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
354 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
355 case AMDGPU::V_ADD_U32_e64:
356 case AMDGPU::V_ADD_CO_U32_e64:
357 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
358 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
359 default:
360 break;
361 }
362
363 if (TII->isMUBUF(UseMI))
364 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
365 if (!TII->isFLATScratch(UseMI))
366 return false;
367
368 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
369 if (OpNo == SIdx)
370 return true;
371
372 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
373 return OpNo == VIdx && SIdx == -1;
374}
375
376/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
377///
378/// => %vgpr = V_ADD_U32 x, frameindex
379bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
380 Register DstReg, Register SrcReg, MachineInstr &MI) const {
381 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
382 MRI->hasOneNonDBGUse(SrcReg)) {
383 MachineInstr *Def = MRI->getVRegDef(SrcReg);
384 if (!Def || Def->getNumOperands() != 4)
385 return false;
386
387 MachineOperand *Src0 = &Def->getOperand(1);
388 MachineOperand *Src1 = &Def->getOperand(2);
389
390 // TODO: This is profitable with more operand types, and for more
391 // opcodes. But ultimately this is working around poor / nonexistent
392 // regbankselect.
393 if (!Src0->isFI() && !Src1->isFI())
394 return false;
395
396 if (Src0->isFI())
397 std::swap(Src0, Src1);
398
399 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
400 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
401 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
402 !Def->getOperand(3).isDead()) // Check if scc is dead
403 return false;
404
405 MachineBasicBlock *MBB = Def->getParent();
406 const DebugLoc &DL = Def->getDebugLoc();
407 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
408 MachineInstrBuilder Add =
409 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
410
411 if (Add->getDesc().getNumDefs() == 2) {
412 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
413 Add.addDef(CarryOutReg, RegState::Dead);
414 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
415 }
416
417 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
418 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
419 Add.addImm(0);
420
421 Def->eraseFromParent();
422 MI.eraseFromParent();
423 return true;
424 }
425
426 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
427
429 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
430 if (Liveness == MachineBasicBlock::LQR_Dead) {
431 // TODO: If src1 satisfies operand constraints, use vop3 version.
432 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
433 .add(*Src0)
434 .add(*Src1)
435 .setOperandDead(3) // implicit-def $vcc
436 .setMIFlags(Def->getFlags());
437 Def->eraseFromParent();
438 MI.eraseFromParent();
439 return true;
440 }
441 }
442
443 return false;
444}
445
447 return new SIFoldOperandsLegacy();
448}
449
450bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
451 unsigned UseOpNo,
452 int64_t ImmVal) const {
453 const uint64_t TSFlags = MI->getDesc().TSFlags;
454
455 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
456 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
457 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
458 return false;
459
460 const MachineOperand &Old = MI->getOperand(UseOpNo);
461 int OpNo = MI->getOperandNo(&Old);
462
463 unsigned Opcode = MI->getOpcode();
464 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
465 switch (OpType) {
466 default:
467 return false;
475 // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
476 // two different constants.
477 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
478 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
479 return false;
480 break;
481 }
482
483 return true;
484}
485
486bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
487 int64_t ImmVal) const {
488 MachineOperand &Old = MI->getOperand(UseOpNo);
489 unsigned Opcode = MI->getOpcode();
490 int OpNo = MI->getOperandNo(&Old);
491 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
492
493 // If the literal can be inlined as-is, apply it and short-circuit the
494 // tests below. The main motivation for this is to avoid unintuitive
495 // uses of opsel.
496 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
497 Old.ChangeToImmediate(ImmVal);
498 return true;
499 }
500
501 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
502 // op_sel in a way that allows an inline constant.
503 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
504 unsigned SrcIdx = ~0;
505 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
506 ModName = AMDGPU::OpName::src0_modifiers;
507 SrcIdx = 0;
508 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
509 ModName = AMDGPU::OpName::src1_modifiers;
510 SrcIdx = 1;
511 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
512 ModName = AMDGPU::OpName::src2_modifiers;
513 SrcIdx = 2;
514 }
515 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
516 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
517 MachineOperand &Mod = MI->getOperand(ModIdx);
518 unsigned ModVal = Mod.getImm();
519
520 uint16_t ImmLo =
521 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
522 uint16_t ImmHi =
523 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
524 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
525 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
526
527 // Helper function that attempts to inline the given value with a newly
528 // chosen opsel pattern.
529 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
530 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
531 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
532 Old.ChangeToImmediate(Imm);
533 return true;
534 }
535
536 // Try to shuffle the halves around and leverage opsel to get an inline
537 // constant.
538 uint16_t Lo = static_cast<uint16_t>(Imm);
539 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
540 if (Lo == Hi) {
541 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
542 Mod.setImm(NewModVal);
544 return true;
545 }
546
547 if (static_cast<int16_t>(Lo) < 0) {
548 int32_t SExt = static_cast<int16_t>(Lo);
549 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
550 Mod.setImm(NewModVal);
551 Old.ChangeToImmediate(SExt);
552 return true;
553 }
554 }
555
556 // This check is only useful for integer instructions
557 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
558 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
559 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
560 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
561 return true;
562 }
563 }
564 } else {
565 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
566 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
567 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
568 Old.ChangeToImmediate(Swapped);
569 return true;
570 }
571 }
572
573 return false;
574 };
575
576 if (tryFoldToInline(Imm))
577 return true;
578
579 // Replace integer addition by subtraction and vice versa if it allows
580 // folding the immediate to an inline constant.
581 //
582 // We should only ever get here for SrcIdx == 1 due to canonicalization
583 // earlier in the pipeline, but we double-check here to be safe / fully
584 // general.
585 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
586 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
587 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
588 unsigned ClampIdx =
589 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
590 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
591
592 if (!Clamp) {
593 uint16_t NegLo = -static_cast<uint16_t>(Imm);
594 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
595 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
596
597 if (tryFoldToInline(NegImm)) {
598 unsigned NegOpcode =
599 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
600 MI->setDesc(TII->get(NegOpcode));
601 return true;
602 }
603 }
604 }
605
606 return false;
607}
608
609bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
610 MachineInstr *MI = Fold.UseMI;
611 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
612 assert(Old.isReg());
613
614 std::optional<int64_t> ImmVal;
615 if (Fold.isImm())
616 ImmVal = Fold.Def.getEffectiveImmVal();
617
618 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
619 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
620 return true;
621
622 // We can't represent the candidate as an inline constant. Try as a literal
623 // with the original opsel, checking constant bus limitations.
624 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
625 int OpNo = MI->getOperandNo(&Old);
626 if (!TII->isOperandLegal(*MI, OpNo, &New))
627 return false;
628 Old.ChangeToImmediate(*ImmVal);
629 return true;
630 }
631
632 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
633 MachineBasicBlock *MBB = MI->getParent();
634 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
635 if (Liveness != MachineBasicBlock::LQR_Dead) {
636 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
637 return false;
638 }
639
640 int Op32 = Fold.ShrinkOpcode;
641 MachineOperand &Dst0 = MI->getOperand(0);
642 MachineOperand &Dst1 = MI->getOperand(1);
643 assert(Dst0.isDef() && Dst1.isDef());
644
645 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
646
647 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
648 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
649
650 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
651
652 if (HaveNonDbgCarryUse) {
653 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
654 Dst1.getReg())
655 .addReg(AMDGPU::VCC, RegState::Kill);
656 }
657
658 // Keep the old instruction around to avoid breaking iterators, but
659 // replace it with a dummy instruction to remove uses.
660 //
661 // FIXME: We should not invert how this pass looks at operands to avoid
662 // this. Should track set of foldable movs instead of looking for uses
663 // when looking at a use.
664 Dst0.setReg(NewReg0);
665 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
666 MI->removeOperand(I);
667 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
668
669 if (Fold.Commuted)
670 TII->commuteInstruction(*Inst32, false);
671 return true;
672 }
673
674 assert(!Fold.needsShrink() && "not handled");
675
676 if (ImmVal) {
677 if (Old.isTied()) {
678 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
679 if (NewMFMAOpc == -1)
680 return false;
681 MI->setDesc(TII->get(NewMFMAOpc));
682 MI->untieRegOperand(0);
683 const MCInstrDesc &MCID = MI->getDesc();
684 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
686 MI->getOperand(I).setIsEarlyClobber(true);
687 }
688
689 // TODO: Should we try to avoid adding this to the candidate list?
690 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
691 int OpNo = MI->getOperandNo(&Old);
692 if (!TII->isOperandLegal(*MI, OpNo, &New))
693 return false;
694
695 Old.ChangeToImmediate(*ImmVal);
696 return true;
697 }
698
699 if (Fold.isGlobal()) {
700 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
701 Fold.Def.OpToFold->getOffset(),
702 Fold.Def.OpToFold->getTargetFlags());
703 return true;
704 }
705
706 if (Fold.isFI()) {
707 Old.ChangeToFrameIndex(Fold.getFI());
708 return true;
709 }
710
711 MachineOperand *New = Fold.Def.OpToFold;
712
713 // Verify the register is compatible with the operand.
714 if (const TargetRegisterClass *OpRC =
715 TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
716 const TargetRegisterClass *NewRC =
717 TRI->getRegClassForReg(*MRI, New->getReg());
718
719 const TargetRegisterClass *ConstrainRC = OpRC;
720 if (New->getSubReg()) {
721 ConstrainRC =
722 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
723
724 if (!ConstrainRC)
725 return false;
726 }
727
728 if (New->getReg().isVirtual() &&
729 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
730 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
731 << TRI->getRegClassName(ConstrainRC) << '\n');
732 return false;
733 }
734 }
735
736 // Rework once the VS_16 register class is updated to include proper
737 // 16-bit SGPRs instead of 32-bit ones.
738 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
739 Old.setSubReg(AMDGPU::NoSubRegister);
740 if (New->getReg().isPhysical()) {
741 Old.substPhysReg(New->getReg(), *TRI);
742 } else {
743 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
744 Old.setIsUndef(New->isUndef());
745 }
746 return true;
747}
748
750 FoldCandidate &&Entry) {
751 // Skip additional folding on the same operand.
752 for (FoldCandidate &Fold : FoldList)
753 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
754 return;
755 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
756 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
757 FoldList.push_back(Entry);
758}
759
761 MachineInstr *MI, unsigned OpNo,
762 const FoldableDef &FoldOp,
763 bool Commuted = false, int ShrinkOp = -1) {
764 appendFoldCandidate(FoldList,
765 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
766}
767
768// Returns true if the instruction is a packed F32 instruction and the
769// corresponding scalar operand reads 32 bits and replicates the bits to both
770// channels.
772 const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
773 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
774 return false;
775 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
777}
778
779// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
780// literal) and replicates the bits to both channels. Therefore, if the hi and
781// lo are not same, we can't fold it.
783 const FoldableDef &OpToFold) {
784 assert(OpToFold.isImm() && "Expected immediate operand");
785 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
786 uint32_t Lo = Lo_32(ImmVal);
787 uint32_t Hi = Hi_32(ImmVal);
788 return Lo == Hi;
789}
790
791bool SIFoldOperandsImpl::tryAddToFoldList(
792 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
793 const FoldableDef &OpToFold) const {
794 const unsigned Opc = MI->getOpcode();
795
796 auto tryToFoldAsFMAAKorMK = [&]() {
797 if (!OpToFold.isImm())
798 return false;
799
800 const bool TryAK = OpNo == 3;
801 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
802 MI->setDesc(TII->get(NewOpc));
803
804 // We have to fold into operand which would be Imm not into OpNo.
805 bool FoldAsFMAAKorMK =
806 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
807 if (FoldAsFMAAKorMK) {
808 // Untie Src2 of fmac.
809 MI->untieRegOperand(3);
810 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
811 if (OpNo == 1) {
812 MachineOperand &Op1 = MI->getOperand(1);
813 MachineOperand &Op2 = MI->getOperand(2);
814 Register OldReg = Op1.getReg();
815 // Operand 2 might be an inlinable constant
816 if (Op2.isImm()) {
817 Op1.ChangeToImmediate(Op2.getImm());
818 Op2.ChangeToRegister(OldReg, false);
819 } else {
820 Op1.setReg(Op2.getReg());
821 Op2.setReg(OldReg);
822 }
823 }
824 return true;
825 }
826 MI->setDesc(TII->get(Opc));
827 return false;
828 };
829
830 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
831 if (!IsLegal && OpToFold.isImm()) {
832 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
833 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
834 }
835
836 if (!IsLegal) {
837 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
838 unsigned NewOpc = macToMad(Opc);
839 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
840 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
841 // to fold the operand.
842 MI->setDesc(TII->get(NewOpc));
843 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
844 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
845 if (AddOpSel)
846 MI->addOperand(MachineOperand::CreateImm(0));
847 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
848 if (FoldAsMAD) {
849 MI->untieRegOperand(OpNo);
850 return true;
851 }
852 if (AddOpSel)
853 MI->removeOperand(MI->getNumExplicitOperands() - 1);
854 MI->setDesc(TII->get(Opc));
855 }
856
857 // Special case for s_fmac_f32 if we are trying to fold into Src2.
858 // By transforming into fmaak we can untie Src2 and make folding legal.
859 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
860 if (tryToFoldAsFMAAKorMK())
861 return true;
862 }
863
864 // Special case for s_setreg_b32
865 if (OpToFold.isImm()) {
866 unsigned ImmOpc = 0;
867 if (Opc == AMDGPU::S_SETREG_B32)
868 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
869 else if (Opc == AMDGPU::S_SETREG_B32_mode)
870 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
871 if (ImmOpc) {
872 MI->setDesc(TII->get(ImmOpc));
873 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
874 return true;
875 }
876 }
877
878 // Operand is not legal, so try to commute the instruction to
879 // see if this makes it possible to fold.
880 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
881 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
882 if (!CanCommute)
883 return false;
884
885 MachineOperand &Op = MI->getOperand(OpNo);
886 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
887
888 // One of operands might be an Imm operand, and OpNo may refer to it after
889 // the call of commuteInstruction() below. Such situations are avoided
890 // here explicitly as OpNo must be a register operand to be a candidate
891 // for memory folding.
892 if (!Op.isReg() || !CommutedOp.isReg())
893 return false;
894
895 // The same situation with an immediate could reproduce if both inputs are
896 // the same register.
897 if (Op.isReg() && CommutedOp.isReg() &&
898 (Op.getReg() == CommutedOp.getReg() &&
899 Op.getSubReg() == CommutedOp.getSubReg()))
900 return false;
901
902 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
903 return false;
904
905 int Op32 = -1;
906 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
907 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
908 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
909 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
910 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
911 return false;
912 }
913
914 // Verify the other operand is a VGPR, otherwise we would violate the
915 // constant bus restriction.
916 MachineOperand &OtherOp = MI->getOperand(OpNo);
917 if (!OtherOp.isReg() ||
918 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
919 return false;
920
921 assert(MI->getOperand(1).isDef());
922
923 // Make sure to get the 32-bit version of the commuted opcode.
924 unsigned MaybeCommutedOpc = MI->getOpcode();
925 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
926 }
927
928 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
929 Op32);
930 return true;
931 }
932
933 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
934 // By changing into fmamk we can untie Src2.
935 // If folding for Src0 happens first and it is identical operand to Src1 we
936 // should avoid transforming into fmamk which requires commuting as it would
937 // cause folding into Src1 to fail later on due to wrong OpNo used.
938 if (Opc == AMDGPU::S_FMAC_F32 &&
939 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
940 if (tryToFoldAsFMAAKorMK())
941 return true;
942 }
943
944 // Special case for PK_F32 instructions if we are trying to fold an imm to
945 // src0 or src1.
946 if (OpToFold.isImm() &&
949 return false;
950
951 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
952 return true;
953}
954
/// Return true if folding into this use of \p MI is structurally allowed.
/// \p UseMO is currently unused; the only restriction checked is SDWA.
bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
                                         const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}
960
963 Register SrcReg) {
964 MachineOperand *Sub = nullptr;
965 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
966 SubDef && TII.isFoldableCopy(*SubDef);
967 SubDef = MRI.getVRegDef(Sub->getReg())) {
968 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
969 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
970
971 if (SrcOp.isImm())
972 return &SrcOp;
973 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
974 break;
975 Sub = &SrcOp;
976 // TODO: Support compose
977 if (SrcOp.getSubReg())
978 break;
979 }
980
981 return Sub;
982}
983
/// Collect the (source operand, subregister index) pairs initializing
/// \p RegSeq into \p Defs, chasing foldable copies to immediates where
/// possible. Returns the common register class of the inputs, or nullptr if
/// the inputs do not share a common subclass.
const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    MachineInstr &RegSeq,
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {

  assert(RegSeq.isRegSequence());

  const TargetRegisterClass *RC = nullptr;

  // REG_SEQUENCE operands come in (value, subreg-index) pairs after the def.
  for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
    MachineOperand &SrcOp = RegSeq.getOperand(I);
    unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();

    // Only accept reg_sequence with uniform reg class inputs for simplicity.
    const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
    if (!RC)
      RC = OpRC;
    else if (!TRI->getCommonSubClass(RC, OpRC))
      return nullptr;

    if (SrcOp.getSubReg()) {
      // TODO: Handle subregister compose
      Defs.emplace_back(&SrcOp, SubRegIdx);
      continue;
    }

    // Prefer the copy-chain source (possibly an immediate) over the raw
    // operand when one can be found.
    MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
      Defs.emplace_back(DefSrc, SubRegIdx);
      continue;
    }

    Defs.emplace_back(&SrcOp, SubRegIdx);
  }

  return RC;
}
1020
1021// Find a def of the UseReg, check if it is a reg_sequence and find initializers
1022// for each subreg, tracking it to an immediate if possible. Returns the
1023// register class of the inputs on success.
1024const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1025 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1026 Register UseReg) const {
1027 MachineInstr *Def = MRI->getVRegDef(UseReg);
1028 if (!Def || !Def->isRegSequence())
1029 return nullptr;
1030
1031 return getRegSeqInit(*Def, Defs);
1032}
1033
// Check whether \p RegSeq is a REG_SEQUENCE whose inputs all track back to the
// same immediate. On success returns the splat value and the register class of
// the inputs; returns an empty pair otherwise. Also recognizes 64-bit splats
// expressed as pairs of 32-bit immediate pieces.
std::pair<int64_t, const TargetRegisterClass *>
SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
  const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
  if (!SrcRC)
    return {};

  bool TryToMatchSplat64 = false;

  int64_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return {};

    int64_t SubImm = Op->getImm();
    if (!I) {
      // First element establishes the candidate splat value.
      Imm = SubImm;
      continue;
    }

    if (Imm != SubImm) {
      if (I == 1 && (E & 1) == 0) {
        // If we have an even number of inputs, there's a chance this is a
        // 64-bit element splat broken into 32-bit pieces.
        TryToMatchSplat64 = true;
        break;
      }

      return {}; // Can only fold splat constants
    }
  }

  if (!TryToMatchSplat64)
    return {Defs[0].first->getImm(), SrcRC};

  // Fallback to recognizing 64-bit splats broken into 32-bit pieces
  // (i.e. each pair of consecutive 32-bit pieces merges to the same 64-bit
  // value)
  int64_t SplatVal64;
  for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
    const MachineOperand *Op0 = Defs[I].first;
    const MachineOperand *Op1 = Defs[I + 1].first;

    if (!Op0->isImm() || !Op1->isImm())
      return {};

    unsigned SubReg0 = Defs[I].second;
    unsigned SubReg1 = Defs[I + 1].second;

    // Assume we're going to generally encounter reg_sequences with sorted
    // subreg indexes, so reject any that aren't consecutive.
    if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
        TRI->getChannelFromSubReg(SubReg1))
      return {};

    // Merge the low (Op0) and high (Op1) 32-bit pieces into one 64-bit value.
    int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
    if (I == 0)
      SplatVal64 = MergedVal;
    else if (SplatVal64 != MergedVal)
      return {};
  }

  const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
      MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);

  return {SplatVal64, RC64};
}
1101
// Check whether the splat immediate \p SplatVal (with input class \p SplatRC)
// may legally replace operand \p UseOpIdx of \p UseMI. Does not perform the
// fold; only validates it.
bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
    MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
    const TargetRegisterClass *SplatRC) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  // Filter out unhandled pseudos.
  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
    return false;

  int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
  if (RCID == -1)
    return false;

  const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);

  // Special case 0/-1, since when interpreted as a 64-bit element both halves
  // have the same bits. These are the only cases where a splat has the same
  // interpretation for 32-bit and 64-bit splats.
  if (SplatVal != 0 && SplatVal != -1) {
    // We need to figure out the scalar type read by the operand. e.g. the MFMA
    // operand will be AReg_128, and we want to check if it's compatible with an
    // AReg_32 constant.
    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
    switch (OpTy) {
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
      break;
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
      break;
    default:
      return false;
    }

    if (!TRI->getCommonSubClass(OpRC, SplatRC))
      return false;
  }

  // Finally verify the immediate itself is legal in this operand position.
  MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
  if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
    return false;

  return true;
}
1153
// Try to record \p OpToFold as an immediate fold candidate for operand
// \p UseOpIdx of \p UseMI. Returns true if a candidate was appended to
// \p FoldList.
bool SIFoldOperandsImpl::tryToFoldACImm(
    const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  // Filter out unhandled pseudos.
  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
    return false;

  // Only immediates that are legal in this operand position are candidates.
  if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
      return false;
    appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
    return true;
  }

  return false;
}
1175
// Attempt to fold \p OpToFold into the use at index \p UseOpIdx of \p UseMI.
// Some cases are rewritten in place (copy -> mov, readfirstlane of a uniform
// value, frame index addressing); otherwise candidates are appended to
// \p FoldList for later application. Rewritten copies are recorded in
// \p CopiesToReplace so implicit operands can be fixed up by the caller.
void SIFoldOperandsImpl::foldOperand(
    FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, *UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp->isReg() && OpToFold.isReg()) {
    if (UseOp->isImplicit())
      return;
    // Allow folding from SGPRs to 16-bit VGPRs.
    if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
        (UseOp->getSubReg() != AMDGPU::lo16 ||
         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    int64_t SplatVal;
    const TargetRegisterClass *SplatRC;
    std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);

    // Grab the use operands first
        llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
    for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
      MachineOperand *RSUse = UsesToProcess[I];
      MachineInstr *RSUseMI = RSUse->getParent();
      unsigned OpNo = RSUseMI->getOperandNo(RSUse);

      if (SplatRC) {
        if (RSUseMI->isCopy()) {
          // Look through copies of the reg_sequence result; their users may
          // also accept the splat immediate.
          Register DstReg = RSUseMI->getOperand(0).getReg();
          append_range(UsesToProcess,
                       make_pointer_range(MRI->use_nodbg_operands(DstReg)));
          continue;
        }
        if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
          FoldableDef SplatDef(SplatVal, SplatRC);
          appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
          continue;
        }
      }

      // TODO: Handle general compose
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      // FIXME: We should avoid recursing here. There should be a cleaner split
      // between the in-place mutations and adding to the fold list.
      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      unsigned CPol =
          TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
      if ((CPol & AMDGPU::CPol::SCAL) &&
        return;

      UseMI->setDesc(TII->get(NewOpc));
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    // In order to fold immediates into copies, we need to change the copy to a
    // MOV. Find a compatible mov instruction with the value.
    for (unsigned MovOp :
         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
          AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
          AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
      const MCInstrDesc &MovDesc = TII->get(MovOp);
      const TargetRegisterClass *MovDstRC =
          TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));

      // Fold if the destination register class of the MOV instruction (ResRC)
      // is a superclass of (or equal to) the destination register class of the
      // COPY (DestRC). If this condition fails, folding would be illegal.
      if (!DestRC->hasSuperClassEq(MovDstRC))
        continue;

      // V_MOV_B16_t16_e64 has src0_modifiers before src0, so the source index
      // differs from the other movs.
      const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;

      int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
      if (RegClassID != -1) {
        const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);

        if (UseSubReg)
          MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);

        // FIXME: We should be able to directly check immediate operand legality
        // for all cases, but gfx908 hacks break.
        if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
            (!OpToFold.isImm() ||
             !TII->isImmOperandLegal(MovDesc, SrcIdx,
                                     *OpToFold.getEffectiveImmVal())))
          break;

        if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
          break;

        // FIXME: This is mutating the instruction only and deferring the actual
        // fold of the immediate
      } else {
        // For the _IMM_PSEUDO cases, there can be value restrictions on the
        // immediate to verify. Technically we should always verify this, but it
        // only matters for these concrete cases.
        // TODO: Handle non-imm case if it's useful.
        if (!OpToFold.isImm() ||
            !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
          break;
      }

      while (ImpOpI != ImpOpE) {
        MachineInstr::mop_iterator Tmp = ImpOpI;
        ImpOpI++;
      }
      UseMI->setDesc(MovDesc);

      if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
        // Rebuild the operand list to match the t16 mov's expected layout.
        const auto &SrcOp = UseMI->getOperand(UseOpIdx);
        MachineOperand NewSrcOp(SrcOp);
        MachineFunction *MF = UseMI->getMF();
        UseMI->removeOperand(1);
        UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
        UseMI->addOperand(NewSrcOp);                          // src0
        UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
        UseOpIdx = SrcIdx;
        UseOp = &UseMI->getOperand(UseOpIdx);
      }
      CopiesToReplace.push_back(UseMI);
      break;
    }

    // We failed to replace the copy, so give up.
    if (UseMI->getOpcode() == AMDGPU::COPY)
      return;

  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg() &&
        OpToFold.DefMI->implicit_operands().empty()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
                        << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      unsigned SubRegIdx = OpToFold.getSubReg();
      // Hack to allow 32-bit SGPRs to be folded into True16 instructions
      // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
      // VS_16RegClass
      //
      // Excerpt from AMDGPUGenRegisterInfoEnums.inc
      // NoSubRegister, //0
      // hi16, // 1
      // lo16, // 2
      // sub0, // 3
      // ...
      // sub1, // 11
      // sub1_hi16, // 12
      // sub1_lo16, // 13
      static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
      if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          TRI->isSGPRReg(*MRI, UseReg)) {
        // Produce the 32 bit subregister index to which the 16-bit subregister
        // is aligned.
        if (SubRegIdx > AMDGPU::sub1) {
          LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
          M |= M.getLane(M.getHighestLane() - 1);
          SmallVector<unsigned, 4> Indexes;
          TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
                                        Indexes);
          assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
          SubRegIdx = Indexes[0];
          // 32-bit registers do not have a sub0 index
        } else if (TII->getOpSize(*UseMI, 1) == 4)
          SubRegIdx = 0;
        else
          SubRegIdx = AMDGPU::sub0;
      }
      UseMI->getOperand(1).setSubReg(SubRegIdx);
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.OpToFold->setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(UseReg);
      if (foldCopyToAGPRRegSequence(UseMI))
        return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
                                     UseMI->getOperand(UseOpIdx).getReg(),
                                     *OpToFold.DefMI, *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm()) {
              *OpToFold.getEffectiveImmVal());
        } else if (OpToFold.isFI())
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
        else {
          assert(OpToFold.isGlobal());
          UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
                                          OpToFold.OpToFold->getOffset(),
                                          OpToFold.OpToFold->getTargetFlags());
        }
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
                                     UseMI->getOperand(UseOpIdx).getReg(),
                                     *OpToFold.DefMI, *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)

        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp->isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
}
1500
// Constant-fold a 32-bit binary ALU opcode. Returns true and sets \p Result
// when \p Opcode is one of the handled operations; returns false otherwise.
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  // The *REV shift variants take the shift amount in the first source operand.
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  // Arithmetic shifts sign-extend, hence the cast to signed.
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}
1566
1567static unsigned getMovOpc(bool IsScalar) {
1568 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1569}
1570
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
//
// Handles NOT of an immediate, fully-constant binary ops, and identity /
// absorbing constants (or x,0 / or x,-1 / and x,0 / and x,-1 / xor x,0).
// Returns true if \p MI was rewritten into a mov or copy.
bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);

  // Unary case: not x with a known immediate folds to a mov.
  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0Imm) {
    MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
    TII->mutateAndCleanupImplicit(
        *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);

  if (!Src0Imm && !Src1Imm)
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0Imm && Src1Imm) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  // One constant operand: only worth handling if the operands can be swapped
  // so the constant sits in src1.
  if (!MI->isCommutable())
    return false;

  if (Src0Imm && !Src1Imm) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
    std::swap(Src0Imm, Src1Imm);
  }

  int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      TII->mutateAndCleanupImplicit(
          *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      TII->mutateAndCleanupImplicit(
          *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}
1681
// Try to fold an instruction into a simpler one
//
// A cndmask whose two value sources are identical (or resolve to the same
// immediate) always produces that value regardless of the condition, so it can
// be replaced with a copy/mov of src0, provided no source modifiers are set.
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    // Not syntactically identical; check whether both materialize the same
    // immediate value.
    std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
    if (!Src1Imm)
      return false;

    std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
    if (!Src0Imm || *Src0Imm != *Src1Imm)
      return false;
  }

  // Source modifiers (neg/abs) would make the two sources differ at runtime.
  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  // Remove operands from highest index to lowest so the indices stay valid.
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  TII->mutateAndCleanupImplicit(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}
1724
1725bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1726 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1727 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1728 return false;
1729
1730 std::optional<int64_t> Src0Imm =
1731 TII->getImmOrMaterializedImm(MI.getOperand(1));
1732 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1733 return false;
1734
1735 Register Src1 = MI.getOperand(2).getReg();
1736 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1737 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1738 return false;
1739
1740 Register Dst = MI.getOperand(0).getReg();
1741 MRI->replaceRegWith(Dst, Src1);
1742 if (!MI.getOperand(2).isKill())
1743 MRI->clearKillFlags(Src1);
1744 MI.eraseFromParent();
1745 return true;
1746}
1747
// Fold \p OpToFold (the source of \p MI, a foldable copy/mov) into the uses of
// MI's destination register. Collects candidates via foldOperand, applies
// them, and re-runs constant folding on instructions that received immediates.
bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
                                         const FoldableDef &OpToFold) const {
  // We need mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

      llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();

    // Apply the use's subregister to the folded value before recursing.
    FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
    foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getMF();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  SetVector<MachineInstr *> ConstantFoldCandidates;
  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.Def.OpToFold);
    if (Fold.isReg() && Fold.getReg().isVirtual()) {
      Register Reg = Fold.getReg();
      const MachineInstr *DefMI = Fold.Def.DefMI;
      // Skip register folds whose defining value may not be uniform across
      // EXEC changes between def and use.
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.Def.OpToFold && Fold.isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);

      if (Fold.isImm())
        ConstantFoldCandidates.insert(Fold.UseMI);

    } else if (Fold.Commuted) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }

  // A successful immediate fold may have made more constant folding possible.
  for (MachineInstr *MI : ConstantFoldCandidates) {
    if (tryConstantFoldOp(MI)) {
      LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
      Changed = true;
    }
  }
  return true;
}
1834
/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
  // initializers right here, so we will rematerialize immediates and avoid
  // copies via different reg classes.
  const TargetRegisterClass *DefRC =
      MRI->getRegClass(CopyMI->getOperand(0).getReg());
  if (!TRI->isAGPRClass(DefRC))
    return false;

  Register UseReg = CopyMI->getOperand(1).getReg();
  MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
  if (!RegSeq || !RegSeq->isRegSequence())
    return false;

  const DebugLoc &DL = CopyMI->getDebugLoc();
  MachineBasicBlock &MBB = *CopyMI->getParent();

  // Builder inserting before CopyMI, which will become the new REG_SEQUENCE.
  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
  // Cache of VGPR copies already created per (reg, subreg) source.
  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;

  const TargetRegisterClass *UseRC =
      MRI->getRegClass(CopyMI->getOperand(1).getReg());

  // Value, subregindex for new REG_SEQUENCE

  unsigned NumRegSeqOperands = RegSeq->getNumOperands();
  // Count how many inputs can be materialized directly; if none, rewriting
  // gains nothing.
  unsigned NumFoldable = 0;

  for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
    MachineOperand &RegOp = RegSeq->getOperand(I);
    unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();

    if (RegOp.getSubReg()) {
      // TODO: Handle subregister compose
      NewDefs.emplace_back(&RegOp, SubRegIdx);
      continue;
    }

    MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
    if (!Lookup)
      Lookup = &RegOp;

    if (Lookup->isImm()) {
      // Check if this is an agpr_32 subregister.
      const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
          DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
      if (DestSuperRC &&
          TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        ++NumFoldable;
        NewDefs.emplace_back(Lookup, SubRegIdx);
        continue;
      }
    }

    const TargetRegisterClass *InputRC =
        Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
                        : MRI->getRegClass(RegOp.getReg());

    // TODO: Account for Lookup->getSubReg()

    // If we can't find a matching super class, this is an SGPR->AGPR or
    // VGPR->AGPR subreg copy (or something constant-like we have to materialize
    // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
    // want to rewrite to copy to an intermediate VGPR class.
    const TargetRegisterClass *MatchRC =
        TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
    if (!MatchRC) {
      ++NumFoldable;
      NewDefs.emplace_back(&RegOp, SubRegIdx);
      continue;
    }

    NewDefs.emplace_back(&RegOp, SubRegIdx);
  }

  // Do not clone a reg_sequence and merely change the result register class.
  if (NumFoldable == 0)
    return false;

  // Turn the COPY into an empty REG_SEQUENCE, then re-add inputs below.
  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
    CopyMI->removeOperand(I);

  for (auto [Def, DestSubIdx] : NewDefs) {
    if (!Def->isReg()) {
      // TODO: Should we use single write for each repeated value like in
      // register case?
      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
          .add(*Def);
      B.addReg(Tmp);
    } else {
      TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
      Def->setIsKill(false);

      Register &VGPRCopy = VGPRCopies[Src];
      if (!VGPRCopy) {
        const TargetRegisterClass *VGPRUseSubRC =
            TRI->getSubRegisterClass(UseRC, DestSubIdx);

        // We cannot build a reg_sequence out of the same registers, they
        // must be copied. Better do it here before copyPhysReg() created
        // several reads to do the AGPR->VGPR->AGPR copy.

        // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
        // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
        // later, create a copy here and track if we already have such a copy.
        const TargetRegisterClass *SubRC =
            TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
        if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
          // TODO: Try to reconstrain class
          VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
          BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
          B.addReg(VGPRCopy);
        } else {
          // If it is already a VGPR, do not copy the register.
          B.add(*Def);
        }
      } else {
        B.addReg(VGPRCopy);
      }
    }

    B.addImm(DestSubIdx);
  }

  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
  return true;
}
1968
/// Try to fold the source of a foldable copy-like instruction (as recognized
/// by SIInstrInfo::isFoldableCopy, see run()) into the users of its
/// destination register, erasing the copy chain when it becomes dead.
///
/// \p CurrentKnownM0Val tracks, across calls within one block, the last value
/// written to m0 so redundant m0 redefinitions can be erased; it is updated
/// (or reset to nullptr) by this function.
/// \returns true if any instruction was changed or erased.
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  Register DstReg = MI.getOperand(0).getReg();
  // Specially track simple redefs of m0 to the same value in a block, so we
  // can erase the later ones.
  if (DstReg == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      // m0 already holds exactly this value; the redefinition is dead.
      MI.eraseFromParent();
      return true;
    }

    // We aren't tracking other physical registers
    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
                            ? nullptr
                            : &NewM0Val;
    return false;
  }

  // For V_MOV_B16 the foldable source is operand 2 (operand 1 presumably
  // holds the source modifiers -- the fold bails out if any are set).
  MachineOperand *OpToFoldPtr;
  if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
    // Folding when any src_modifiers are non-zero is unsupported
    if (TII->hasAnyModifiersSet(MI))
      return false;
    OpToFoldPtr = &MI.getOperand(2);
  } else
    OpToFoldPtr = &MI.getOperand(1);
  MachineOperand &OpToFold = *OpToFoldPtr;
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // FIXME: We could also be folding things like TargetIndexes.
  if (!FoldingImm && !OpToFold.isReg())
    return false;

  // Fold virtual registers and constant physical registers.
  if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
      !TRI->isConstantPhysReg(OpToFold.getReg()))
    return false;

  // Prevent folding operands backwards in the function. For example,
  // the COPY opcode must not be replaced by 1 in this example:
  //
  // %3 = COPY %vgpr0; VGPR_32:%3
  // ...
  // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
  if (!DstReg.isVirtual())
    return false;

  const TargetRegisterClass *DstRC =
      MRI->getRegClass(MI.getOperand(0).getReg());

  // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
  // Can remove this code if proper 16-bit SGPRs are implemented
  // Example: Pre-peephole-opt
  // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
  // %32:sreg_32 = COPY %29:sgpr_lo16
  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
  // Post-peephole-opt and DCE
  // %32:sreg_32 = COPY %16.lo16:sreg_32
  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
  // After this transform
  // %32:sreg_32 = COPY %16:sreg_32
  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
  // After the fold operands pass
  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
  if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
      OpToFold.getSubReg()) {
    if (DstRC == &AMDGPU::SReg_32RegClass &&
        DstRC == MRI->getRegClass(OpToFold.getReg())) {
      assert(OpToFold.getSubReg() == AMDGPU::lo16);
      OpToFold.setSubReg(0);
    }
  }

  // Fold copy to AGPR through reg_sequence
  // TODO: Handle with subregister extract
  if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
    if (foldCopyToAGPRRegSequence(&MI))
      return true;
  }

  FoldableDef Def(OpToFold, DstRC);
  bool Changed = foldInstOperand(MI, Def);

  // If we managed to fold all uses of this copy then we might as well
  // delete it now.
  // The only reason we need to follow chains of copies here is that
  // tryFoldRegSequence looks forward through copies before folding a
  // REG_SEQUENCE into its eventual users.
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    // Remember the source before erasing so we can keep walking up the chain.
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  // A REG_SEQUENCE left dead at the head of the erased copy chain can go too.
  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  if (Changed)
    return true;

  // Run this after foldInstOperand to avoid turning scalar additions into
  // vector additions when the result scalar result could just be folded into
  // the user(s).
  return OpToFold.isReg() &&
         foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
}
2087
// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
// Returns the common source operand when \p MI is a clamp idiom (a v_max of
// a register with itself, clamp bit set, no other modifiers); otherwise
// returns nullptr.
const MachineOperand *
SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_MAX_BF16_PSEUDO_e64:
  case AMDGPU::V_PK_MAX_NUM_BF16: {
    if (MI.mayRaiseFPException())
      return nullptr;

    // The clamp bit itself must be set on the max.
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    // NOTE(review): the '?' arm of the conditional below (the modifier bits
    // required for the packed opcodes) appears to have been lost from this
    // copy of the file -- confirm against upstream before building.
    unsigned UnsetMods =
        (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
            : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}
2141
2142// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2143bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2144 const MachineOperand *ClampSrc = isClamp(MI);
2145 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2146 return false;
2147
2148 if (!ClampSrc->getReg().isVirtual())
2149 return false;
2150
2151 // Look through COPY. COPY only observed with True16.
2152 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2153 MachineInstr *Def =
2154 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2155
2156 // The type of clamp must be compatible.
2157 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2158 return false;
2159
2160 if (Def->mayRaiseFPException())
2161 return false;
2162
2163 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2164 if (!DefClamp)
2165 return false;
2166
2167 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2168
2169 // Clamp is applied after omod, so it is OK if omod is set.
2170 DefClamp->setImm(1);
2171
2172 Register DefReg = Def->getOperand(0).getReg();
2173 Register MIDstReg = MI.getOperand(0).getReg();
2174 if (TRI->isSGPRReg(*MRI, DefReg)) {
2175 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
2176 // instruction with a VGPR dst.
2177 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2178 MIDstReg)
2179 .addReg(DefReg);
2180 } else {
2181 MRI->replaceRegWith(MIDstReg, DefReg);
2182 }
2183 MI.eraseFromParent();
2184
2185 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2186 // instruction, so we might as well convert it to the more flexible VOP3-only
2187 // mad/fma form.
2188 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2189 Def->eraseFromParent();
2190
2191 return true;
2192}
2193
2194static int getOModValue(unsigned Opc, int64_t Val) {
2195 switch (Opc) {
2196 case AMDGPU::V_MUL_F64_e64:
2197 case AMDGPU::V_MUL_F64_pseudo_e64: {
2198 switch (Val) {
2199 case 0x3fe0000000000000: // 0.5
2200 return SIOutMods::DIV2;
2201 case 0x4000000000000000: // 2.0
2202 return SIOutMods::MUL2;
2203 case 0x4010000000000000: // 4.0
2204 return SIOutMods::MUL4;
2205 default:
2206 return SIOutMods::NONE;
2207 }
2208 }
2209 case AMDGPU::V_MUL_F32_e64: {
2210 switch (static_cast<uint32_t>(Val)) {
2211 case 0x3f000000: // 0.5
2212 return SIOutMods::DIV2;
2213 case 0x40000000: // 2.0
2214 return SIOutMods::MUL2;
2215 case 0x40800000: // 4.0
2216 return SIOutMods::MUL4;
2217 default:
2218 return SIOutMods::NONE;
2219 }
2220 }
2221 case AMDGPU::V_MUL_F16_e64:
2222 case AMDGPU::V_MUL_F16_t16_e64:
2223 case AMDGPU::V_MUL_F16_fake16_e64: {
2224 switch (static_cast<uint16_t>(Val)) {
2225 case 0x3800: // 0.5
2226 return SIOutMods::DIV2;
2227 case 0x4000: // 2.0
2228 return SIOutMods::MUL2;
2229 case 0x4400: // 4.0
2230 return SIOutMods::MUL4;
2231 default:
2232 return SIOutMods::NONE;
2233 }
2234 }
2235 default:
2236 llvm_unreachable("invalid mul opcode");
2237 }
2238}
2239
// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
// Recognize \p MI as an omod-foldable multiply (by 0.5/2/4) or an
// fadd-x-x (mul by 2) and return the scaled operand plus the SIOutMods
// encoding; returns {nullptr, SIOutMods::NONE} when not foldable.
std::pair<const MachineOperand *, int>
SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    // NOTE(review): the denormal-mode subexpressions of this condition appear
    // to have been lost from this copy of the file (the parentheses below are
    // unbalanced) -- confirm against upstream before building.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
        MI.mayRaiseFPException())
      return std::pair(nullptr, SIOutMods::NONE);

    // One source must be the constant multiplier, the other the scaled value.
    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    // Reject when the constant is not 0.5/2/4 or any modifier is in use.
    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    // NOTE(review): same truncation as above -- the denormal-mode checks are
    // missing from this condition; confirm against upstream.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}
2320
2321// FIXME: Does this need to check IEEE bit on function?
2322bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2323 const MachineOperand *RegOp;
2324 int OMod;
2325 std::tie(RegOp, OMod) = isOMod(MI);
2326 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2327 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2328 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2329 return false;
2330
2331 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2332 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2333 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2334 return false;
2335
2336 if (Def->mayRaiseFPException())
2337 return false;
2338
2339 // Clamp is applied after omod. If the source already has clamp set, don't
2340 // fold it.
2341 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2342 return false;
2343
2344 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2345
2346 DefOMod->setImm(OMod);
2347 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2348 // Kill flags can be wrong if we replaced a def inside a loop with a def
2349 // outside the loop.
2350 MRI->clearKillFlags(Def->getOperand(0).getReg());
2351 MI.eraseFromParent();
2352
2353 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2354 // instruction, so we might as well convert it to the more flexible VOP3-only
2355 // mad/fma form.
2356 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2357 Def->eraseFromParent();
2358
2359 return true;
2360}
2361
// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  // Only on subtargets with AGPR-capable instructions, and only when the
  // reg_sequence result has a single non-debug use.
  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  // NOTE(review): the declaration of Defs (the operand/subreg-index pairs
  // filled in by getRegSeqInit below) appears to have been lost from this
  // copy of the file -- confirm against upstream before building.

  if (!getRegSeqInit(Defs, Reg))
    return false;

  // Every input must already be an AGPR, or a plain (subreg-free) COPY from
  // one.
  for (auto &[Op, SubIdx] : Defs) {
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  // Walk forward through the single-use copy chain to the eventual consumer.
  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  // The consumer's operand must be constrained to a vector superclass for the
  // AGPR replacement to be legal.
  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  // Build an AGPR version of the reg_sequence and feed it to the consumer.
  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (auto &[Def, SubIdx] : Defs) {
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg());
    }
    RS.addImm(SubIdx);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    // Roll back: the consumer rejected the AGPR operand after all.
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}
2440
2441/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
2442/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2443static bool isAGPRCopy(const SIRegisterInfo &TRI,
2444 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2445 Register &OutReg, unsigned &OutSubReg) {
2446 assert(Copy.isCopy());
2447
2448 const MachineOperand &CopySrc = Copy.getOperand(1);
2449 Register CopySrcReg = CopySrc.getReg();
2450 if (!CopySrcReg.isVirtual())
2451 return false;
2452
2453 // Common case: copy from AGPR directly, e.g.
2454 // %1:vgpr_32 = COPY %0:agpr_32
2455 if (TRI.isAGPR(MRI, CopySrcReg)) {
2456 OutReg = CopySrcReg;
2457 OutSubReg = CopySrc.getSubReg();
2458 return true;
2459 }
2460
2461 // Sometimes it can also involve two copies, e.g.
2462 // %1:vgpr_256 = COPY %0:agpr_256
2463 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2464 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2465 if (!CopySrcDef || !CopySrcDef->isCopy())
2466 return false;
2467
2468 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2469 Register OtherCopySrcReg = OtherCopySrc.getReg();
2470 if (!OtherCopySrcReg.isVirtual() ||
2471 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2472 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2473 !TRI.isAGPR(MRI, OtherCopySrcReg))
2474 return false;
2475
2476 OutReg = OtherCopySrcReg;
2477 OutSubReg = CopySrc.getSubReg();
2478 return true;
2479}
2480
2481// Try to hoist an AGPR to VGPR copy across a PHI.
2482// This should allow folding of an AGPR into a consumer which may support it.
2483//
2484// Example 1: LCSSA PHI
2485// loop:
2486// %1:vreg = COPY %0:areg
2487// exit:
2488// %2:vreg = PHI %1:vreg, %loop
2489// =>
2490// loop:
2491// exit:
2492// %1:areg = PHI %0:areg, %loop
2493// %2:vreg = COPY %1:areg
2494//
2495// Example 2: PHI with multiple incoming values:
2496// entry:
2497// %1:vreg = GLOBAL_LOAD(..)
2498// loop:
2499// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2500// %3:areg = COPY %2:vreg
2501// %4:areg = (instr using %3:areg)
2502// %5:vreg = COPY %4:areg
2503// =>
2504// entry:
2505// %1:vreg = GLOBAL_LOAD(..)
2506// %2:areg = COPY %1:vreg
2507// loop:
2508// %3:areg = PHI %2:areg, %entry, %X:areg,
2509// %4:areg = (instr using %3:areg)
// Hoist AGPR -> VGPR copies across a VGPR PHI whose incoming values are all
// (copies of) AGPRs: rewrite the PHI to produce an AGPR and copy back to the
// original VGPR result. See the worked examples in the comment above.
bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
  assert(PHI.isPHI());

  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // Iterate once over all incoming values of the PHI to check if this PHI is
  // eligible, and determine the exact AGPR RC we'll target.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    // All AGPR incoming values must agree on a common class.
    if (ARC && !ARC->hasSubClassEq(CopyInRC))
      return false;
    ARC = CopyInRC;
  }

  // No incoming value was an AGPR copy: nothing to do.
  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Rewrite the PHI's incoming values to ARC.
  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    // NOTE(review): the declaration of InsertPt (a basic-block iterator,
    // assigned and used below) appears to have been lost from this copy of
    // the file -- confirm against upstream before building.
    MachineBasicBlock *InsertMBB = nullptr;

    // Look at the def of Reg, ignoring all copies.
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {

      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
      // the copy was single-use, it will be removed by DCE later.
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }

        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
        // is unlikely to be profitable.
        //
        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
        MachineOperand &CopyIn = Def->getOperand(1);
        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      }

      // Insert the new copy right after the defining instruction.
      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    } else {
      // No visible def: insert at the end of the incoming block.
      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
      InsertPt = InsertMBB->getFirstTerminator();
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);

    (void)MI;
    LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
  }

  // Replace the PHI's result with a new register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  // COPY that new register back to the original PhiOut register. This COPY will
  // usually be folded out later.
  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
  return true;
}
2612
// Attempt to convert VGPR load to an AGPR load.
// Legal only when every transitive user of the loaded value is a COPY or
// REG_SEQUENCE that ends up in an AGPR; all intermediate registers are then
// retyped to the equivalent AGPR classes.
bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  // NOTE(review): the declaration of Users (a worklist seeded from the
  // non-debug users of DefReg by the call below) appears to have been
  // truncated in this copy of the file -- confirm against upstream.

      llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
  SmallVector<Register, 8> MoveRegs;

  if (Users.empty())
    return false;

  // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one instruction definitions
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    // Not yet an AGPR: remember it for retyping and keep following its users.
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    // The load cannot take an AGPR def; restore the original class.
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  // Retype every intermediate copy/reg_sequence result to AGPR as well.
  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}
2667
// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for GFX908
// there's cases where it can create a lot more AGPR-AGPR copies, which are
// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for register that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
// element).
//
// Example
// a:
// %in:agpr_256 = COPY %foo:vgpr_256
// c:
// %x:agpr_32 = ..
// b:
// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
// =>
// a:
// %in:agpr_256 = COPY %foo:vgpr_256
// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
// %tmp_agpr:agpr_32 = COPY %tmp
// c:
// %x:agpr_32 = ..
// b:
// %0:areg = PHI %tmp_agpr, %a, %x, %c
// %1:areg = PHI %tmp_agpr, %a, %y, %c
// %2:areg = PHI %tmp_agpr, %a, %z, %c
bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // This is only really needed on GFX908 where AGPR-AGPR copies are
  // unreasonably difficult.
  if (ST->hasGFX90AInsts())
    return false;

  // Look at all AGPR Phis and collect the register + subregister used.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  // PHIs are grouped at the top of the block; stop at the first non-PHI.
  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    // Only subregister reads are interesting: those are the ones that would
    // otherwise be expanded into per-use temporaries.
    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      if (!PhiMO.getSubReg())
        continue;
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  // For all (Reg, SubReg) pair that are used more than once, cache the value in
  // a VGPR.
  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
    // out.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, /* flags */ {}, SubReg);

    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
    }

    Changed = true;
  }

  return Changed;
}
2764
2765bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2766 this->MF = &MF;
2767 MRI = &MF.getRegInfo();
2768 ST = &MF.getSubtarget<GCNSubtarget>();
2769 TII = ST->getInstrInfo();
2770 TRI = &TII->getRegisterInfo();
2771 MFI = MF.getInfo<SIMachineFunctionInfo>();
2772
2773 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2774 // correctly handle signed zeros.
2775 //
2776 // FIXME: Also need to check strictfp
2777 bool IsIEEEMode = MFI->getMode().IEEE;
2778
2779 bool Changed = false;
2780 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2781 MachineOperand *CurrentKnownM0Val = nullptr;
2782 for (auto &MI : make_early_inc_range(*MBB)) {
2783 Changed |= tryFoldCndMask(MI);
2784
2785 if (tryFoldZeroHighBits(MI)) {
2786 Changed = true;
2787 continue;
2788 }
2789
2790 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2791 Changed = true;
2792 continue;
2793 }
2794
2795 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2796 Changed = true;
2797 continue;
2798 }
2799
2800 if (MI.mayLoad() && tryFoldLoad(MI)) {
2801 Changed = true;
2802 continue;
2803 }
2804
2805 if (TII->isFoldableCopy(MI)) {
2806 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2807 continue;
2808 }
2809
2810 // Saw an unknown clobber of m0, so we no longer know what it is.
2811 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2812 CurrentKnownM0Val = nullptr;
2813
2814 // TODO: Omod might be OK if there is NSZ only on the source
2815 // instruction, and not the omod multiply.
2816 if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI))
2817 Changed |= tryFoldClamp(MI);
2818 }
2819
2820 Changed |= tryOptimizeAGPRPhis(*MBB);
2821 }
2822
2823 return Changed;
2824}
2825
2828 MFPropsModifier _(*this, MF);
2829
2830 bool Changed = SIFoldOperandsImpl().run(MF);
2831 if (!Changed) {
2832 return PreservedAnalyses::all();
2833 }
2835 PA.preserveSet<CFGAnalyses>();
2836 return PA;
2837}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
#define _
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is a AGPR -> VGPR copy.
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, FoldCandidate &&Entry)
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static unsigned getMovOpc(bool IsScalar)
static MachineOperand * lookUpCopyChain(const SIInstrInfo &TII, const MachineRegisterInfo &MRI, Register SrcReg)
static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(const FoldableDef &OpToFold)
static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo)
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static int Lookup(ArrayRef< TableEntry > Table, unsigned Opcode)
Value * RHS
Value * LHS
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool hasDOTOpSelHazard() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
const HexagonRegisterInfo & getRegisterInfo() const
ArrayRef< MCOperandInfo > operands() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void clearFlag(MIFlag Flag)
clearFlag - Clear a MI flag.
bool isRegSequence() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
LLVM_ABI void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
LLVM_ABI void substPhysReg(MCRegister Reg, const TargetRegisterInfo &)
substPhysReg - Substitute the current register with the physical register Reg, taking any existing Su...
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
IteratorT begin() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:210
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:226
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:212
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:220
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:227
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:223
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:225
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:215
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:240
LLVM_READONLY int32_t getFlatScratchInstSSfromSV(uint32_t Opcode)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createSIFoldOperandsLegacyPass()
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
char & SIFoldOperandsLegacyID
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.