SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands -------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
13#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14#include "SIMachineFunctionInfo.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/CodeGen/MachineFunctionPass.h"
17#include "llvm/CodeGen/MachineOperand.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
24struct FoldCandidate {
25 MachineInstr *UseMI;
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
31 int ShrinkOpcode;
32 unsigned UseOpNo;
33 MachineOperand::MachineOperandType Kind;
34 bool Commuted;
35
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
69class SIFoldOperands : public MachineFunctionPass {
70public:
71 static char ID;
72 MachineRegisterInfo *MRI;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 bool updateOperand(FoldCandidate &Fold) const;
82
83 bool canUseImmWithOpSel(FoldCandidate &Fold) const;
84
85 bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
86
87 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
88 MachineInstr *MI, unsigned OpNo,
89 MachineOperand *OpToFold) const;
90 bool isUseSafeToFold(const MachineInstr &MI,
91 const MachineOperand &UseMO) const;
92 bool
93 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
94 Register UseReg, uint8_t OpTy) const;
95 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
96 unsigned UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList) const;
98 void foldOperand(MachineOperand &OpToFold,
99 MachineInstr *UseMI,
100 int UseOpIdx,
101 SmallVectorImpl<FoldCandidate> &FoldList,
102 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
103
104 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
105 bool tryConstantFoldOp(MachineInstr *MI) const;
106 bool tryFoldCndMask(MachineInstr &MI) const;
107 bool tryFoldZeroHighBits(MachineInstr &MI) const;
108 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
109 bool tryFoldFoldableCopy(MachineInstr &MI,
110 MachineOperand *&CurrentKnownM0Val) const;
111
112 const MachineOperand *isClamp(const MachineInstr &MI) const;
113 bool tryFoldClamp(MachineInstr &MI);
114
115 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
116 bool tryFoldOMod(MachineInstr &MI);
117 bool tryFoldRegSequence(MachineInstr &MI);
118 bool tryFoldPhiAGPR(MachineInstr &MI);
119 bool tryFoldLoad(MachineInstr &MI);
120
121 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
122
123public:
124 SIFoldOperands() : MachineFunctionPass(ID) {
125 initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
126 }
127
128 bool runOnMachineFunction(MachineFunction &MF) override;
129
130 StringRef getPassName() const override { return "SI Fold Operands"; }
131
132 void getAnalysisUsage(AnalysisUsage &AU) const override {
133 AU.setPreservesCFG();
134 MachineFunctionPass::getAnalysisUsage(AU);
135 }
136};
137
138} // End anonymous namespace.
139
140INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
141 "SI Fold Operands", false, false)
142
143char SIFoldOperands::ID = 0;
144
145char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
146
147static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
148 const TargetRegisterInfo &TRI,
149 const MachineOperand &MO) {
150 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
151 if (const TargetRegisterClass *SubRC =
152 TRI.getSubRegisterClass(RC, MO.getSubReg()))
153 RC = SubRC;
154 return RC;
155}
156
157// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
158static unsigned macToMad(unsigned Opc) {
159 switch (Opc) {
160 case AMDGPU::V_MAC_F32_e64:
161 return AMDGPU::V_MAD_F32_e64;
162 case AMDGPU::V_MAC_F16_e64:
163 return AMDGPU::V_MAD_F16_e64;
164 case AMDGPU::V_FMAC_F32_e64:
165 return AMDGPU::V_FMA_F32_e64;
166 case AMDGPU::V_FMAC_F16_e64:
167 return AMDGPU::V_FMA_F16_gfx9_e64;
168 case AMDGPU::V_FMAC_F16_t16_e64:
169 return AMDGPU::V_FMA_F16_gfx9_e64;
170 case AMDGPU::V_FMAC_LEGACY_F32_e64:
171 return AMDGPU::V_FMA_LEGACY_F32_e64;
172 case AMDGPU::V_FMAC_F64_e64:
173 return AMDGPU::V_FMA_F64_e64;
174 }
175 return AMDGPU::INSTRUCTION_LIST_END;
176}
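// Illustrative sketch (not from the original source; operand layout is
// schematic): the MAC/FMAC forms above read their destination as the tied
// src2, so an immediate cannot be folded into src2 directly. Rewriting to the
// untied MAD/FMA form first makes the fold legal, e.g.
//   %d = V_MAC_F32_e64 0, %a, 0, %b, 0, %d(tied)
//   --> %d = V_MAD_F32_e64 0, %a, 0, %b, 0, <folded literal>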
177
178// TODO: Add heuristic that the frame index might not fit in the addressing mode
179// immediate offset to avoid materializing in loops.
180bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
181 const MachineOperand &OpToFold) const {
182 if (!OpToFold.isFI())
183 return false;
184
185 const unsigned Opc = UseMI.getOpcode();
186 if (TII->isMUBUF(UseMI))
187 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
188 if (!TII->isFLATScratch(UseMI))
189 return false;
190
191 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
192 if (OpNo == SIdx)
193 return true;
194
195 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
196 return OpNo == VIdx && SIdx == -1;
197}
198
199FunctionPass *llvm::createSIFoldOperandsPass() {
200 return new SIFoldOperands();
201}
202
203bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
204 MachineInstr *MI = Fold.UseMI;
205 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
206 const uint64_t TSFlags = MI->getDesc().TSFlags;
207
208 assert(Old.isReg() && Fold.isImm());
209
210 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
211 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
212 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
213 return false;
214
215 unsigned Opcode = MI->getOpcode();
216 int OpNo = MI->getOperandNo(&Old);
217 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
218 switch (OpType) {
219 default:
220 return false;
221 case AMDGPU::OPERAND_REG_IMM_V2FP16:
222 case AMDGPU::OPERAND_REG_IMM_V2BF16:
223 case AMDGPU::OPERAND_REG_IMM_V2INT16:
224 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
225 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
226 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
227 break;
228 }
229
230 return true;
231}
232
233bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
234 MachineInstr *MI = Fold.UseMI;
235 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
236 unsigned Opcode = MI->getOpcode();
237 int OpNo = MI->getOperandNo(&Old);
238 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
239
240 // If the literal can be inlined as-is, apply it and short-circuit the
241 // tests below. The main motivation for this is to avoid unintuitive
242 // uses of opsel.
243 if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
244 Old.ChangeToImmediate(Fold.ImmToFold);
245 return true;
246 }
247
248 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
249 // op_sel in a way that allows an inline constant.
250 int ModIdx = -1;
251 unsigned SrcIdx = ~0;
252 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
253 ModIdx = AMDGPU::OpName::src0_modifiers;
254 SrcIdx = 0;
255 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
256 ModIdx = AMDGPU::OpName::src1_modifiers;
257 SrcIdx = 1;
258 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
259 ModIdx = AMDGPU::OpName::src2_modifiers;
260 SrcIdx = 2;
261 }
262 assert(ModIdx != -1);
263 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
264 MachineOperand &Mod = MI->getOperand(ModIdx);
265 unsigned ModVal = Mod.getImm();
266
267 uint16_t ImmLo = static_cast<uint16_t>(
268 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
269 uint16_t ImmHi = static_cast<uint16_t>(
270 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
271 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
272 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
273
274 // Helper function that attempts to inline the given value with a newly
275 // chosen opsel pattern.
276 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
277 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
278 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
279 Old.ChangeToImmediate(Imm);
280 return true;
281 }
282
283 // Try to shuffle the halves around and leverage opsel to get an inline
284 // constant.
285 uint16_t Lo = static_cast<uint16_t>(Imm);
286 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
287 if (Lo == Hi) {
288 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
289 Mod.setImm(NewModVal);
290 Old.ChangeToImmediate(Lo);
291 return true;
292 }
293
294 if (static_cast<int16_t>(Lo) < 0) {
295 int32_t SExt = static_cast<int16_t>(Lo);
296 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
297 Mod.setImm(NewModVal);
298 Old.ChangeToImmediate(SExt);
299 return true;
300 }
301 }
302
303 // This check is only useful for integer instructions
304 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
305 OpType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16) {
306 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
307 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
308 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
309 return true;
310 }
311 }
312 } else {
313 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
314 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
315 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
316 Old.ChangeToImmediate(Swapped);
317 return true;
318 }
319 }
320
321 return false;
322 };
323
324 if (tryFoldToInline(Imm))
325 return true;
326
327 // Replace integer addition by subtraction and vice versa if it allows
328 // folding the immediate to an inline constant.
329 //
330 // We should only ever get here for SrcIdx == 1 due to canonicalization
331 // earlier in the pipeline, but we double-check here to be safe / fully
332 // general.
333 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
334 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
335 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
336 unsigned ClampIdx =
337 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
338 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
339
340 if (!Clamp) {
341 uint16_t NegLo = -static_cast<uint16_t>(Imm);
342 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
343 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
344
345 if (tryFoldToInline(NegImm)) {
346 unsigned NegOpcode =
347 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
348 MI->setDesc(TII->get(NegOpcode));
349 return true;
350 }
351 }
352 }
353
354 return false;
355}
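// Worked example (illustrative only): if the packed literal cannot be encoded
// inline as-is but its two 16-bit halves are equal, tryFoldImmWithOpSel above
// retries with just that half and uses op_sel/op_sel_hi to broadcast it; for
// v_pk_add_u16 without clamp it may also negate both halves and flip the
// opcode to v_pk_sub_u16 when the negated value happens to be inlinable.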
356
357bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
358 MachineInstr *MI = Fold.UseMI;
359 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
360 assert(Old.isReg());
361
362 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
363 if (tryFoldImmWithOpSel(Fold))
364 return true;
365
366 // We can't represent the candidate as an inline constant. Try as a literal
367 // with the original opsel, checking constant bus limitations.
368 MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
369 int OpNo = MI->getOperandNo(&Old);
370 if (!TII->isOperandLegal(*MI, OpNo, &New))
371 return false;
372 Old.ChangeToImmediate(Fold.ImmToFold);
373 return true;
374 }
375
376 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
377 MachineBasicBlock *MBB = MI->getParent();
378 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
379 if (Liveness != MachineBasicBlock::LQR_Dead) {
380 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
381 return false;
382 }
383
384 int Op32 = Fold.ShrinkOpcode;
385 MachineOperand &Dst0 = MI->getOperand(0);
386 MachineOperand &Dst1 = MI->getOperand(1);
387 assert(Dst0.isDef() && Dst1.isDef());
388
389 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
390
391 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
392 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
393
394 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
395
396 if (HaveNonDbgCarryUse) {
397 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
398 Dst1.getReg())
399 .addReg(AMDGPU::VCC, RegState::Kill);
400 }
401
402 // Keep the old instruction around to avoid breaking iterators, but
403 // replace it with a dummy instruction to remove uses.
404 //
405 // FIXME: We should not invert how this pass looks at operands to avoid
406 // this. Should track set of foldable movs instead of looking for uses
407 // when looking at a use.
408 Dst0.setReg(NewReg0);
409 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
410 MI->removeOperand(I);
411 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
412
413 if (Fold.Commuted)
414 TII->commuteInstruction(*Inst32, false);
415 return true;
416 }
417
418 assert(!Fold.needsShrink() && "not handled");
419
420 if (Fold.isImm()) {
421 if (Old.isTied()) {
422 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
423 if (NewMFMAOpc == -1)
424 return false;
425 MI->setDesc(TII->get(NewMFMAOpc));
426 MI->untieRegOperand(0);
427 }
428 Old.ChangeToImmediate(Fold.ImmToFold);
429 return true;
430 }
431
432 if (Fold.isGlobal()) {
433 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
434 Fold.OpToFold->getTargetFlags());
435 return true;
436 }
437
438 if (Fold.isFI()) {
439 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
440 return true;
441 }
442
443 MachineOperand *New = Fold.OpToFold;
444 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
445 Old.setIsUndef(New->isUndef());
446 return true;
447}
448
449static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
450 const MachineInstr *MI) {
451 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
452}
453
454static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
455 MachineInstr *MI, unsigned OpNo,
456 MachineOperand *FoldOp, bool Commuted = false,
457 int ShrinkOp = -1) {
458 // Skip additional folding on the same operand.
459 for (FoldCandidate &Fold : FoldList)
460 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
461 return;
462 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
463 << " operand " << OpNo << "\n " << *MI);
464 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
465}
466
467bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
468 MachineInstr *MI, unsigned OpNo,
469 MachineOperand *OpToFold) const {
470 const unsigned Opc = MI->getOpcode();
471
472 auto tryToFoldAsFMAAKorMK = [&]() {
473 if (!OpToFold->isImm())
474 return false;
475
476 const bool TryAK = OpNo == 3;
477 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
478 MI->setDesc(TII->get(NewOpc));
479
480 // We have to fold into the operand that will hold the immediate, not into OpNo.
481 bool FoldAsFMAAKorMK =
482 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
483 if (FoldAsFMAAKorMK) {
484 // Untie Src2 of fmac.
485 MI->untieRegOperand(3);
486 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
487 if (OpNo == 1) {
488 MachineOperand &Op1 = MI->getOperand(1);
489 MachineOperand &Op2 = MI->getOperand(2);
490 Register OldReg = Op1.getReg();
491 // Operand 2 might be an inlinable constant
492 if (Op2.isImm()) {
493 Op1.ChangeToImmediate(Op2.getImm());
494 Op2.ChangeToRegister(OldReg, false);
495 } else {
496 Op1.setReg(Op2.getReg());
497 Op2.setReg(OldReg);
498 }
499 }
500 return true;
501 }
502 MI->setDesc(TII->get(Opc));
503 return false;
504 };
505
506 bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
507 if (!IsLegal && OpToFold->isImm()) {
508 FoldCandidate Fold(MI, OpNo, OpToFold);
509 IsLegal = canUseImmWithOpSel(Fold);
510 }
511
512 if (!IsLegal) {
513 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
514 unsigned NewOpc = macToMad(Opc);
515 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
516 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
517 // to fold the operand.
518 MI->setDesc(TII->get(NewOpc));
519 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
520 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
521 if (AddOpSel)
522 MI->addOperand(MachineOperand::CreateImm(0));
523 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
524 if (FoldAsMAD) {
525 MI->untieRegOperand(OpNo);
526 return true;
527 }
528 if (AddOpSel)
529 MI->removeOperand(MI->getNumExplicitOperands() - 1);
530 MI->setDesc(TII->get(Opc));
531 }
532
533 // Special case for s_fmac_f32 if we are trying to fold into Src2.
534 // By transforming into fmaak we can untie Src2 and make folding legal.
535 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
536 if (tryToFoldAsFMAAKorMK())
537 return true;
538 }
539
540 // Special case for s_setreg_b32
541 if (OpToFold->isImm()) {
542 unsigned ImmOpc = 0;
543 if (Opc == AMDGPU::S_SETREG_B32)
544 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
545 else if (Opc == AMDGPU::S_SETREG_B32_mode)
546 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
547 if (ImmOpc) {
548 MI->setDesc(TII->get(ImmOpc));
549 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
550 return true;
551 }
552 }
553
554 // If we are already folding into another operand of MI, then
555 // we can't commute the instruction, otherwise we risk making the
556 // other fold illegal.
557 if (isUseMIInFoldList(FoldList, MI))
558 return false;
559
560 // Operand is not legal, so try to commute the instruction to
561 // see if this makes it possible to fold.
562 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
563 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
564 if (!CanCommute)
565 return false;
566
567 // One of operands might be an Imm operand, and OpNo may refer to it after
568 // the call of commuteInstruction() below. Such situations are avoided
569 // here explicitly as OpNo must be a register operand to be a candidate
570 // for memory folding.
571 if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
572 return false;
573
574 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
575 return false;
576
577 int Op32 = -1;
578 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
579 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
580 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
581 (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
582 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
583 return false;
584 }
585
586 // Verify the other operand is a VGPR, otherwise we would violate the
587 // constant bus restriction.
588 MachineOperand &OtherOp = MI->getOperand(OpNo);
589 if (!OtherOp.isReg() ||
590 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
591 return false;
592
593 assert(MI->getOperand(1).isDef());
594
595 // Make sure to get the 32-bit version of the commuted opcode.
596 unsigned MaybeCommutedOpc = MI->getOpcode();
597 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
598 }
599
600 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
601 return true;
602 }
603
604 // Inlineable constant might have been folded into Imm operand of fmaak or
605 // fmamk and we are trying to fold a non-inlinable constant.
606 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
607 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
608 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
609 MachineOperand &OpImm = MI->getOperand(ImmIdx);
610 if (!OpImm.isReg() &&
611 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
612 return tryToFoldAsFMAAKorMK();
613 }
614
615 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
616 // By changing into fmamk we can untie Src2.
617 // If folding for Src0 happens first and Src0 is identical to Src1, we
618 // should avoid transforming into fmamk (which requires commuting), as that
619 // would make the later fold into Src1 fail due to a wrong OpNo being used.
620 if (Opc == AMDGPU::S_FMAC_F32 &&
621 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
622 if (tryToFoldAsFMAAKorMK())
623 return true;
624 }
625
626 // Check the case where we might introduce a second constant operand to a
627 // scalar instruction
628 if (TII->isSALU(MI->getOpcode())) {
629 const MCInstrDesc &InstDesc = MI->getDesc();
630 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
631
632 // Fine if the operand can be encoded as an inline constant
633 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
634 // Otherwise check for another constant
635 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
636 auto &Op = MI->getOperand(i);
637 if (OpNo != i && !Op.isReg() &&
638 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
639 return false;
640 }
641 }
642 }
643
644 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
645 return true;
646}
647
648bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
649 const MachineOperand &UseMO) const {
650 // Operands of SDWA instructions must be registers.
651 return !TII->isSDWA(MI);
652}
653
654// Find a def of the UseReg, check if it is a reg_sequence and find initializers
655// for each subreg, tracking it to foldable inline immediate if possible.
656// Returns true on success.
657bool SIFoldOperands::getRegSeqInit(
658 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
659 Register UseReg, uint8_t OpTy) const {
660 MachineInstr *Def = MRI->getVRegDef(UseReg);
661 if (!Def || !Def->isRegSequence())
662 return false;
663
664 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
665 MachineOperand *Sub = &Def->getOperand(I);
666 assert(Sub->isReg());
667
668 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
669 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
670 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
671 SubDef = MRI->getVRegDef(Sub->getReg())) {
672 MachineOperand *Op = &SubDef->getOperand(1);
673 if (Op->isImm()) {
674 if (TII->isInlineConstant(*Op, OpTy))
675 Sub = Op;
676 break;
677 }
678 if (!Op->isReg() || Op->getReg().isPhysical())
679 break;
680 Sub = Op;
681 }
682
683 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
684 }
685
686 return true;
687}
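// Illustrative example (schematic MIR, virtual register numbers hypothetical):
//   %1:sgpr_32 = S_MOV_B32 0
//   %2:sreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
// For %2, getRegSeqInit above reports the S_MOV_B32 immediate operand as the
// initializer of both sub0 and sub1, so callers can treat %2 as a splat of 0.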
688
689bool SIFoldOperands::tryToFoldACImm(
690 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
691 SmallVectorImpl<FoldCandidate> &FoldList) const {
692 const MCInstrDesc &Desc = UseMI->getDesc();
693 if (UseOpIdx >= Desc.getNumOperands())
694 return false;
695
696 if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
697 return false;
698
699 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
700 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
701 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
702 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
703 return true;
704 }
705
706 if (!OpToFold.isReg())
707 return false;
708
709 Register UseReg = OpToFold.getReg();
710 if (!UseReg.isVirtual())
711 return false;
712
713 if (isUseMIInFoldList(FoldList, UseMI))
714 return false;
715
716 // Maybe it is just a COPY of an immediate itself.
717 MachineInstr *Def = MRI->getVRegDef(UseReg);
718 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
719 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
720 MachineOperand &DefOp = Def->getOperand(1);
721 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
722 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
723 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
724 return true;
725 }
726 }
727
728 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
729 if (!getRegSeqInit(Defs, UseReg, OpTy))
730 return false;
731
732 int32_t Imm;
733 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
734 const MachineOperand *Op = Defs[I].first;
735 if (!Op->isImm())
736 return false;
737
738 auto SubImm = Op->getImm();
739 if (!I) {
740 Imm = SubImm;
741 if (!TII->isInlineConstant(*Op, OpTy) ||
742 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
743 return false;
744
745 continue;
746 }
747 if (Imm != SubImm)
748 return false; // Can only fold splat constants
749 }
750
751 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
752 return true;
753}
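// For example (schematic MIR, names hypothetical), a 64-bit operand built as
//   %0 = S_MOV_B32 0
//   %1 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
// is a splat of the inline constant 0 and can be folded as the immediate 0 by
// tryToFoldACImm above; a sequence mixing different values cannot be folded.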
754
755void SIFoldOperands::foldOperand(
756 MachineOperand &OpToFold,
757 MachineInstr *UseMI,
758 int UseOpIdx,
759 SmallVectorImpl<FoldCandidate> &FoldList,
760 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
761 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
762
763 if (!isUseSafeToFold(*UseMI, *UseOp))
764 return;
765
766 // FIXME: Fold operands with subregs.
767 if (UseOp->isReg() && OpToFold.isReg() &&
768 (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
769 return;
770
771 // Special case for REG_SEQUENCE: We can't fold literals into
772 // REG_SEQUENCE instructions, so we have to fold them into the
773 // uses of REG_SEQUENCE.
774 if (UseMI->isRegSequence()) {
775 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
776 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
777
778 // Grab the use operands first
779 SmallVector<MachineOperand *, 4> UsesToProcess;
780 for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
781 UsesToProcess.push_back(&Use);
782 for (auto *RSUse : UsesToProcess) {
783 MachineInstr *RSUseMI = RSUse->getParent();
784
785 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
786 RSUseMI->getOperandNo(RSUse), FoldList))
787 continue;
788
789 if (RSUse->getSubReg() != RegSeqDstSubReg)
790 continue;
791
792 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
793 CopiesToReplace);
794 }
795 return;
796 }
797
798 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
799 return;
800
801 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
802 // Verify that this is a stack access.
803 // FIXME: Should probably use stack pseudos before frame lowering.
804
805 if (TII->isMUBUF(*UseMI)) {
806 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
807 MFI->getScratchRSrcReg())
808 return;
809
810 // Ensure this is either relative to the current frame or the current
811 // wave.
812 MachineOperand &SOff =
813 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
814 if (!SOff.isImm() || SOff.getImm() != 0)
815 return;
816 }
817
818 // A frame index will resolve to a positive constant, so it should always be
819 // safe to fold the addressing mode, even pre-GFX9.
820 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
821
822 const unsigned Opc = UseMI->getOpcode();
823 if (TII->isFLATScratch(*UseMI) &&
824 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
825 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
826 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
827 UseMI->setDesc(TII->get(NewOpc));
828 }
829
830 return;
831 }
832
833 bool FoldingImmLike =
834 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
835
836 if (FoldingImmLike && UseMI->isCopy()) {
837 Register DestReg = UseMI->getOperand(0).getReg();
838 Register SrcReg = UseMI->getOperand(1).getReg();
839 assert(SrcReg.isVirtual());
840
841 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
842
843 // Don't fold into a copy to a physical register with the same class. Doing
844 // so would interfere with the register coalescer's logic which would avoid
845 // redundant initializations.
846 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
847 return;
848
849 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
850 if (!DestReg.isPhysical()) {
851 if (DestRC == &AMDGPU::AGPR_32RegClass &&
852 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
853 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
854 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
855 CopiesToReplace.push_back(UseMI);
856 return;
857 }
858 }
859
860 // In order to fold immediates into copies, we need to change the
861 // copy to a MOV.
862
863 unsigned MovOp = TII->getMovOpcode(DestRC);
864 if (MovOp == AMDGPU::COPY)
865 return;
866
867 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
868 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
869 while (ImpOpI != ImpOpE) {
870 MachineInstr::mop_iterator Tmp = ImpOpI;
871 ImpOpI++;
872 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
873 }
874 UseMI->setDesc(TII->get(MovOp));
875
876 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
877 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
878 MachineOperand NewSrcOp(SrcOp);
879 MachineFunction *MF = UseMI->getParent()->getParent();
880 UseMI->removeOperand(1);
881 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
882 UseMI->addOperand(NewSrcOp); // src0
883 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
884 UseOpIdx = 2;
885 UseOp = &UseMI->getOperand(UseOpIdx);
886 }
887 CopiesToReplace.push_back(UseMI);
888 } else {
889 if (UseMI->isCopy() && OpToFold.isReg() &&
890 UseMI->getOperand(0).getReg().isVirtual() &&
891 !UseMI->getOperand(1).getSubReg()) {
892 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
893 unsigned Size = TII->getOpSize(*UseMI, 1);
894 Register UseReg = OpToFold.getReg();
895 UseMI->getOperand(1).setReg(UseReg);
896 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
897 UseMI->getOperand(1).setIsKill(false);
898 CopiesToReplace.push_back(UseMI);
899 OpToFold.setIsKill(false);
900
901 // Remove kill flags as kills may now be out of order with uses.
902 MRI->clearKillFlags(OpToFold.getReg());
903
904 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32
905 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
906 // its initializers right here, so we will rematerialize immediates and
907 // avoid copies via different reg classes.
908 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
909 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
910 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
911 const DebugLoc &DL = UseMI->getDebugLoc();
912 MachineBasicBlock &MBB = *UseMI->getParent();
913
914 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
915 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
916 UseMI->removeOperand(I);
917
918 MachineInstrBuilder B(*MBB.getParent(), UseMI);
919 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
920 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
921 for (unsigned I = 0; I < Size / 4; ++I) {
922 MachineOperand *Def = Defs[I].first;
923 TargetInstrInfo::RegSubRegPair CopyToVGPR;
924 if (Def->isImm() &&
925 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
926 int64_t Imm = Def->getImm();
927
928 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
929 BuildMI(MBB, UseMI, DL,
930 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
931 B.addReg(Tmp);
932 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
933 auto Src = getRegSubRegPair(*Def);
934 Def->setIsKill(false);
935 if (!SeenAGPRs.insert(Src)) {
936 // We cannot build a reg_sequence out of the same registers, they
937 // must be copied. Better do it here before copyPhysReg() created
938 // several reads to do the AGPR->VGPR->AGPR copy.
939 CopyToVGPR = Src;
940 } else {
941 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
942 Src.SubReg);
943 }
944 } else {
945 assert(Def->isReg());
946 Def->setIsKill(false);
947 auto Src = getRegSubRegPair(*Def);
948
949 // Direct copy from SGPR to AGPR is not possible. To avoid creation
950 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
951 // create a copy here and track if we already have such a copy.
952 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
953 CopyToVGPR = Src;
954 } else {
955 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
956 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
957 B.addReg(Tmp);
958 }
959 }
960
961 if (CopyToVGPR.Reg) {
962 Register Vgpr;
963 if (VGPRCopies.count(CopyToVGPR)) {
964 Vgpr = VGPRCopies[CopyToVGPR];
965 } else {
966 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
967 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
968 VGPRCopies[CopyToVGPR] = Vgpr;
969 }
970 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
971 BuildMI(MBB, UseMI, DL,
972 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
973 B.addReg(Tmp);
974 }
975
976 B.addImm(Defs[I].second);
977 }
978 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
979 return;
980 }
981
982 if (Size != 4)
983 return;
984
985 Register Reg0 = UseMI->getOperand(0).getReg();
986 Register Reg1 = UseMI->getOperand(1).getReg();
987 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
988 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
989 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
990 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
991 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
992 TRI->isAGPR(*MRI, Reg1))
993 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
994 return;
995 }
996
997 unsigned UseOpc = UseMI->getOpcode();
998 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
999 (UseOpc == AMDGPU::V_READLANE_B32 &&
1000 (int)UseOpIdx ==
1001 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1002 // %vgpr = V_MOV_B32 imm
1003 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1004 // =>
1005 // %sgpr = S_MOV_B32 imm
1006 if (FoldingImmLike) {
1007 if (execMayBeModifiedBeforeUse(*MRI,
1008 UseMI->getOperand(UseOpIdx).getReg(),
1009 *OpToFold.getParent(),
1010 *UseMI))
1011 return;
1012
1013 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1014
1015 if (OpToFold.isImm())
1016 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1017 else
1018 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
1019 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1020 return;
1021 }
1022
1023 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1024 if (execMayBeModifiedBeforeUse(*MRI,
1025 UseMI->getOperand(UseOpIdx).getReg(),
1026 *OpToFold.getParent(),
1027 *UseMI))
1028 return;
1029
1030 // %vgpr = COPY %sgpr0
1031 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1032 // =>
1033 // %sgpr1 = COPY %sgpr0
1034 UseMI->setDesc(TII->get(AMDGPU::COPY));
1035 UseMI->getOperand(1).setReg(OpToFold.getReg());
1036 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1037 UseMI->getOperand(1).setIsKill(false);
1038 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1039 return;
1040 }
1041 }
1042
1043 const MCInstrDesc &UseDesc = UseMI->getDesc();
1044
1045 // Don't fold into target independent nodes. Target independent opcodes
1046 // don't have defined register classes.
1047 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1048 UseDesc.operands()[UseOpIdx].RegClass == -1)
1049 return;
1050 }
1051
1052 if (!FoldingImmLike) {
1053 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
1054 // Don't fold if OpToFold doesn't hold an aligned register.
1055 const TargetRegisterClass *RC =
1056 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
1057 assert(RC);
1058 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
1059 unsigned SubReg = OpToFold.getSubReg();
1060 if (const TargetRegisterClass *SubRC =
1061 TRI->getSubRegisterClass(RC, SubReg))
1062 RC = SubRC;
1063 }
1064
1065 if (!RC || !TRI->isProperlyAlignedRC(*RC))
1066 return;
1067 }
1068
1069 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1070
1071 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1072 // to enable more folding opportunities. The shrink operands pass
1073 // already does this.
1074 return;
1075 }
1076
1077
1078 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
1079 const TargetRegisterClass *FoldRC =
1080 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
1081
1082 // Split 64-bit constants into 32-bits for folding.
1083 if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
1084 Register UseReg = UseOp->getReg();
1085 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
1086 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
1087 return;
1088
1089 APInt Imm(64, OpToFold.getImm());
1090 if (UseOp->getSubReg() == AMDGPU::sub0) {
1091 Imm = Imm.getLoBits(32);
1092 } else {
1093 assert(UseOp->getSubReg() == AMDGPU::sub1);
1094 Imm = Imm.getHiBits(32);
1095 }
1096
1097 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
1098 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
1099 return;
1100 }
1101
1102 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1103}
1104
1105static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1106 uint32_t LHS, uint32_t RHS) {
1107 switch (Opcode) {
1108 case AMDGPU::V_AND_B32_e64:
1109 case AMDGPU::V_AND_B32_e32:
1110 case AMDGPU::S_AND_B32:
1111 Result = LHS & RHS;
1112 return true;
1113 case AMDGPU::V_OR_B32_e64:
1114 case AMDGPU::V_OR_B32_e32:
1115 case AMDGPU::S_OR_B32:
1116 Result = LHS | RHS;
1117 return true;
1118 case AMDGPU::V_XOR_B32_e64:
1119 case AMDGPU::V_XOR_B32_e32:
1120 case AMDGPU::S_XOR_B32:
1121 Result = LHS ^ RHS;
1122 return true;
1123 case AMDGPU::S_XNOR_B32:
1124 Result = ~(LHS ^ RHS);
1125 return true;
1126 case AMDGPU::S_NAND_B32:
1127 Result = ~(LHS & RHS);
1128 return true;
1129 case AMDGPU::S_NOR_B32:
1130 Result = ~(LHS | RHS);
1131 return true;
1132 case AMDGPU::S_ANDN2_B32:
1133 Result = LHS & ~RHS;
1134 return true;
1135 case AMDGPU::S_ORN2_B32:
1136 Result = LHS | ~RHS;
1137 return true;
1138 case AMDGPU::V_LSHL_B32_e64:
1139 case AMDGPU::V_LSHL_B32_e32:
1140 case AMDGPU::S_LSHL_B32:
1141 // The instruction ignores the high bits for out of bounds shifts.
1142 Result = LHS << (RHS & 31);
1143 return true;
1144 case AMDGPU::V_LSHLREV_B32_e64:
1145 case AMDGPU::V_LSHLREV_B32_e32:
1146 Result = RHS << (LHS & 31);
1147 return true;
1148 case AMDGPU::V_LSHR_B32_e64:
1149 case AMDGPU::V_LSHR_B32_e32:
1150 case AMDGPU::S_LSHR_B32:
1151 Result = LHS >> (RHS & 31);
1152 return true;
1153 case AMDGPU::V_LSHRREV_B32_e64:
1154 case AMDGPU::V_LSHRREV_B32_e32:
1155 Result = RHS >> (LHS & 31);
1156 return true;
1157 case AMDGPU::V_ASHR_I32_e64:
1158 case AMDGPU::V_ASHR_I32_e32:
1159 case AMDGPU::S_ASHR_I32:
1160 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1161 return true;
1162 case AMDGPU::V_ASHRREV_I32_e64:
1163 case AMDGPU::V_ASHRREV_I32_e32:
1164 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1165 return true;
1166 default:
1167 return false;
1168 }
1169}
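// Worked example (illustrative): folding S_LSHL_B32 with LHS = 1 and RHS = 36
// masks the shift amount to 36 & 31 = 4 and produces 1 << 4 = 16, matching
// the hardware's treatment of out-of-bounds shift amounts.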
1170
1171static unsigned getMovOpc(bool IsScalar) {
1172 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1173}
1174
1175static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1176 MI.setDesc(NewDesc);
1177
1178 // Remove any leftover implicit operands from mutating the instruction. e.g.
1179 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1180 // anymore.
1181 const MCInstrDesc &Desc = MI.getDesc();
1182 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1183 Desc.implicit_defs().size();
1184
1185 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1186 MI.removeOperand(I);
1187}
1188
1189MachineOperand *
1190SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1191 // If this has a subregister, it obviously is a register source.
1192 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1193 !Op.getReg().isVirtual())
1194 return &Op;
1195
1196 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1197 if (Def && Def->isMoveImmediate()) {
1198 MachineOperand &ImmSrc = Def->getOperand(1);
1199 if (ImmSrc.isImm())
1200 return &ImmSrc;
1201 }
1202
1203 return &Op;
1204}
1205
1206// Try to simplify operations with a constant that may appear after instruction
1207// selection.
1208// TODO: See if a frame index with a fixed offset can fold.
1209bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1210 if (!MI->allImplicitDefsAreDead())
1211 return false;
1212
1213 unsigned Opc = MI->getOpcode();
1214
1215 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1216 if (Src0Idx == -1)
1217 return false;
1218 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1219
1220 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1221 Opc == AMDGPU::S_NOT_B32) &&
1222 Src0->isImm()) {
1223 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1224 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1225 return true;
1226 }
1227
1228 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1229 if (Src1Idx == -1)
1230 return false;
1231 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1232
1233 if (!Src0->isImm() && !Src1->isImm())
1234 return false;
1235
1236 // and k0, k1 -> v_mov_b32 (k0 & k1)
1237 // or k0, k1 -> v_mov_b32 (k0 | k1)
1238 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1239 if (Src0->isImm() && Src1->isImm()) {
1240 int32_t NewImm;
1241 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1242 return false;
1243
1244 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1245
1246 // Be careful to change the right operand, src0 may belong to a different
1247 // instruction.
1248 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1249 MI->removeOperand(Src1Idx);
1250 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1251 return true;
1252 }
1253
1254 if (!MI->isCommutable())
1255 return false;
1256
1257 if (Src0->isImm() && !Src1->isImm()) {
1258 std::swap(Src0, Src1);
1259 std::swap(Src0Idx, Src1Idx);
1260 }
1261
1262 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1263 if (Opc == AMDGPU::V_OR_B32_e64 ||
1264 Opc == AMDGPU::V_OR_B32_e32 ||
1265 Opc == AMDGPU::S_OR_B32) {
1266 if (Src1Val == 0) {
1267 // y = or x, 0 => y = copy x
1268 MI->removeOperand(Src1Idx);
1269 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1270 } else if (Src1Val == -1) {
1271 // y = or x, -1 => y = v_mov_b32 -1
1272 MI->removeOperand(Src1Idx);
1273 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1274 } else
1275 return false;
1276
1277 return true;
1278 }
1279
1280 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1281 Opc == AMDGPU::S_AND_B32) {
1282 if (Src1Val == 0) {
1283 // y = and x, 0 => y = v_mov_b32 0
1284 MI->removeOperand(Src0Idx);
1285 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1286 } else if (Src1Val == -1) {
1287 // y = and x, -1 => y = copy x
1288 MI->removeOperand(Src1Idx);
1289 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1290 } else
1291 return false;
1292
1293 return true;
1294 }
1295
1296 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1297 Opc == AMDGPU::S_XOR_B32) {
1298 if (Src1Val == 0) {
1299 // y = xor x, 0 => y = copy x
1300 MI->removeOperand(Src1Idx);
1301 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1302 return true;
1303 }
1304 }
1305
1306 return false;
1307}
1308
1309// Try to fold an instruction into a simpler one
1310bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1311 unsigned Opc = MI.getOpcode();
1312 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1313 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1314 return false;
1315
1316 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1317 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1318 if (!Src1->isIdenticalTo(*Src0)) {
1319 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1320 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1321 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1322 return false;
1323 }
1324
1325 int Src1ModIdx =
1326 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1327 int Src0ModIdx =
1328 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1329 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1330 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1331 return false;
1332
1333 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1334 auto &NewDesc =
1335 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1336 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1337 if (Src2Idx != -1)
1338 MI.removeOperand(Src2Idx);
1339 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1340 if (Src1ModIdx != -1)
1341 MI.removeOperand(Src1ModIdx);
1342 if (Src0ModIdx != -1)
1343 MI.removeOperand(Src0ModIdx);
1344 mutateCopyOp(MI, NewDesc);
1345 LLVM_DEBUG(dbgs() << MI);
1346 return true;
1347}
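// Illustrative example (schematic MIR): when both sources are the same value,
//   %2 = V_CNDMASK_B32_e64 0, %1, 0, %1, %vcc
// no longer depends on the condition and is rewritten above to
//   %2 = COPY %1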
1348
1349bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1350 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1351 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1352 return false;
1353
1354 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1355 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1356 return false;
1357
1358 Register Src1 = MI.getOperand(2).getReg();
1359 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1360 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1361 return false;
1362
1363 Register Dst = MI.getOperand(0).getReg();
1364 MRI->replaceRegWith(Dst, Src1);
1365 if (!MI.getOperand(2).isKill())
1366 MRI->clearKillFlags(Src1);
1367 MI.eraseFromParent();
1368 return true;
1369}
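// Illustrative example (schematic MIR): if %1 is defined by an opcode the
// subtarget reports as already zeroing the high 16 bits of its result, then
//   %2 = V_AND_B32_e32 0xffff, %1
// is redundant, and all uses of %2 are rewritten above to use %1 directly.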
1370
1371bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1372 MachineOperand &OpToFold) const {
1373 // We need to mutate the operands of new mov instructions to add implicit
1374 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1375 // this.
1376 SmallVector<MachineInstr *, 4> CopiesToReplace;
1377 SmallVector<FoldCandidate, 4> FoldList;
1378 MachineOperand &Dst = MI.getOperand(0);
1379 bool Changed = false;
1380
1381 if (OpToFold.isImm()) {
1382 for (auto &UseMI :
1383 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1384 // Folding the immediate may reveal operations that can be constant
1385 // folded or replaced with a copy. This can happen for example after
1386 // frame indices are lowered to constants or from splitting 64-bit
1387 // constants.
1388 //
1389 // We may also encounter cases where one or both operands are
1390 // immediates materialized into a register, which would ordinarily not
1391 // be folded due to multiple uses or operand constraints.
1392 if (tryConstantFoldOp(&UseMI)) {
1393 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1394 Changed = true;
1395 }
1396 }
1397 }
1398
1399 SmallVector<MachineOperand *, 4> UsesToProcess;
1400 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1401 UsesToProcess.push_back(&Use);
1402 for (auto *U : UsesToProcess) {
1403 MachineInstr *UseMI = U->getParent();
1404 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1405 CopiesToReplace);
1406 }
1407
1408 if (CopiesToReplace.empty() && FoldList.empty())
1409 return Changed;
1410
1411 MachineFunction *MF = MI.getParent()->getParent();
1412 // Make sure we add EXEC uses to any new v_mov instructions created.
1413 for (MachineInstr *Copy : CopiesToReplace)
1414 Copy->addImplicitDefUseOperands(*MF);
1415
1416 for (FoldCandidate &Fold : FoldList) {
1417 assert(!Fold.isReg() || Fold.OpToFold);
1418 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1419 Register Reg = Fold.OpToFold->getReg();
1420 MachineInstr *DefMI = Fold.OpToFold->getParent();
1421 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1422 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1423 continue;
1424 }
1425 if (updateOperand(Fold)) {
1426 // Clear kill flags.
1427 if (Fold.isReg()) {
1428 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1429 // FIXME: Probably shouldn't bother trying to fold if not an
1430 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1431 // copies.
1432 MRI->clearKillFlags(Fold.OpToFold->getReg());
1433 }
1434 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1435 << static_cast<int>(Fold.UseOpNo) << " of "
1436 << *Fold.UseMI);
1437 } else if (Fold.Commuted) {
1438 // Restoring instruction's original operand order if fold has failed.
1439 TII->commuteInstruction(*Fold.UseMI, false);
1440 }
1441 }
1442 return true;
1443}
1444
1445bool SIFoldOperands::tryFoldFoldableCopy(
1446 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1447 // Specially track simple redefs of m0 to the same value in a block, so we
1448 // can erase the later ones.
1449 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1450 MachineOperand &NewM0Val = MI.getOperand(1);
1451 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1452 MI.eraseFromParent();
1453 return true;
1454 }
1455
1456 // We aren't tracking other physical registers
1457 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1458 ? nullptr
1459 : &NewM0Val;
1460 return false;
1461 }
1462
1463 MachineOperand &OpToFold = MI.getOperand(1);
1464 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1465
1466 // FIXME: We could also be folding things like TargetIndexes.
1467 if (!FoldingImm && !OpToFold.isReg())
1468 return false;
1469
1470 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1471 return false;
1472
1473 // Prevent folding operands backwards in the function. For example,
1474 // the COPY opcode must not be replaced by 1 in this example:
1475 //
1476 // %3 = COPY %vgpr0; VGPR_32:%3
1477 // ...
1478 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1479 if (!MI.getOperand(0).getReg().isVirtual())
1480 return false;
1481
1482 bool Changed = foldInstOperand(MI, OpToFold);
1483
1484 // If we managed to fold all uses of this copy then we might as well
1485 // delete it now.
1486 // The only reason we need to follow chains of copies here is that
1487 // tryFoldRegSequence looks forward through copies before folding a
1488 // REG_SEQUENCE into its eventual users.
1489 auto *InstToErase = &MI;
1490 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1491 auto &SrcOp = InstToErase->getOperand(1);
1492 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1493 InstToErase->eraseFromParent();
1494 Changed = true;
1495 InstToErase = nullptr;
1496 if (!SrcReg || SrcReg.isPhysical())
1497 break;
1498 InstToErase = MRI->getVRegDef(SrcReg);
1499 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1500 break;
1501 }
1502
1503 if (InstToErase && InstToErase->isRegSequence() &&
1504 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1505 InstToErase->eraseFromParent();
1506 Changed = true;
1507 }
1508
1509 return Changed;
1510}
1511
1512// Clamp patterns are canonically selected to v_max_* instructions, so only
1513// handle them.
1514const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1515 unsigned Op = MI.getOpcode();
1516 switch (Op) {
1517 case AMDGPU::V_MAX_F32_e64:
1518 case AMDGPU::V_MAX_F16_e64:
1519 case AMDGPU::V_MAX_F16_t16_e64:
1520 case AMDGPU::V_MAX_F16_fake16_e64:
1521 case AMDGPU::V_MAX_F64_e64:
1522 case AMDGPU::V_MAX_NUM_F64_e64:
1523 case AMDGPU::V_PK_MAX_F16: {
1524 if (MI.mayRaiseFPException())
1525 return nullptr;
1526
1527 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1528 return nullptr;
1529
1530 // Make sure sources are identical.
1531 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1532 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1533 if (!Src0->isReg() || !Src1->isReg() ||
1534 Src0->getReg() != Src1->getReg() ||
1535 Src0->getSubReg() != Src1->getSubReg() ||
1536 Src0->getSubReg() != AMDGPU::NoSubRegister)
1537 return nullptr;
1538
1539 // Can't fold up if we have modifiers.
1540 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1541 return nullptr;
1542
1543 unsigned Src0Mods
1544 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1545 unsigned Src1Mods
1546 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1547
1548 // Having a 0 op_sel_hi would require swizzling the output in the source
1549 // instruction, which we can't do.
1550 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1551 : 0u;
1552 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1553 return nullptr;
1554 return Src0;
1555 }
1556 default:
1557 return nullptr;
1558 }
1559}
1560
1561// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1562bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1563 const MachineOperand *ClampSrc = isClamp(MI);
1564 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1565 return false;
1566
1567 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1568
1569 // The type of clamp must be compatible.
1570 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1571 return false;
1572
1573 if (Def->mayRaiseFPException())
1574 return false;
1575
1576 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1577 if (!DefClamp)
1578 return false;
1579
1580 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1581
1582 // Clamp is applied after omod, so it is OK if omod is set.
1583 DefClamp->setImm(1);
1584
1585 Register DefReg = Def->getOperand(0).getReg();
1586 Register MIDstReg = MI.getOperand(0).getReg();
1587 if (TRI->isSGPRReg(*MRI, DefReg)) {
1588 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
1589 // instruction with a VGPR dst.
1590 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
1591 MIDstReg)
1592 .addReg(DefReg);
1593 } else {
1594 MRI->replaceRegWith(MIDstReg, DefReg);
1595 }
1596 MI.eraseFromParent();
1597
1598 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1599 // instruction, so we might as well convert it to the more flexible VOP3-only
1600 // mad/fma form.
1601 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1602 Def->eraseFromParent();
1603
1604 return true;
1605}
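// Illustrative example (schematic MIR, modifier operands elided): given a
// single-use VALU result fed into a clamping max,
//   %1 = V_ADD_F32_e64 %a, %b, clamp=0
//   %2 = V_MAX_F32_e64 %1, %1, clamp=1
// tryFoldClamp above sets clamp=1 on the V_ADD and replaces %2 with %1.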
1606
1607static int getOModValue(unsigned Opc, int64_t Val) {
1608 switch (Opc) {
1609 case AMDGPU::V_MUL_F64_e64:
1610 case AMDGPU::V_MUL_F64_pseudo_e64: {
1611 switch (Val) {
1612 case 0x3fe0000000000000: // 0.5
1613 return SIOutMods::DIV2;
1614 case 0x4000000000000000: // 2.0
1615 return SIOutMods::MUL2;
1616 case 0x4010000000000000: // 4.0
1617 return SIOutMods::MUL4;
1618 default:
1619 return SIOutMods::NONE;
1620 }
1621 }
1622 case AMDGPU::V_MUL_F32_e64: {
1623 switch (static_cast<uint32_t>(Val)) {
1624 case 0x3f000000: // 0.5
1625 return SIOutMods::DIV2;
1626 case 0x40000000: // 2.0
1627 return SIOutMods::MUL2;
1628 case 0x40800000: // 4.0
1629 return SIOutMods::MUL4;
1630 default:
1631 return SIOutMods::NONE;
1632 }
1633 }
1634 case AMDGPU::V_MUL_F16_e64:
1635 case AMDGPU::V_MUL_F16_t16_e64:
1636 case AMDGPU::V_MUL_F16_fake16_e64: {
1637 switch (static_cast<uint16_t>(Val)) {
1638 case 0x3800: // 0.5
1639 return SIOutMods::DIV2;
1640 case 0x4000: // 2.0
1641 return SIOutMods::MUL2;
1642 case 0x4400: // 4.0
1643 return SIOutMods::MUL4;
1644 default:
1645 return SIOutMods::NONE;
1646 }
1647 }
1648 default:
1649 llvm_unreachable("invalid mul opcode");
1650 }
1651}
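// For example, 0x40000000 is the IEEE-754 single-precision encoding of 2.0,
// so a V_MUL_F32_e64 by that literal maps to the MUL2 output modifier; the
// f64 and f16 cases above likewise match the encodings of 0.5, 2.0 and 4.0.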
1652
1653// FIXME: Does this really not support denormals with f16?
1654// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1655// handled, so will anything other than that break?
1656std::pair<const MachineOperand *, int>
1657SIFoldOperands::isOMod(const MachineInstr &MI) const {
1658 unsigned Op = MI.getOpcode();
1659 switch (Op) {
1660 case AMDGPU::V_MUL_F64_e64:
1661 case AMDGPU::V_MUL_F64_pseudo_e64:
1662 case AMDGPU::V_MUL_F32_e64:
1663 case AMDGPU::V_MUL_F16_t16_e64:
1664 case AMDGPU::V_MUL_F16_fake16_e64:
1665 case AMDGPU::V_MUL_F16_e64: {
1666 // If output denormals are enabled, omod is ignored.
1667 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1668 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1669 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1670 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1671 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1672 MFI->getMode().FP64FP16Denormals.Output !=
1673 DenormalMode::PreserveSign) ||
1674 MI.mayRaiseFPException())
1675 return std::pair(nullptr, SIOutMods::NONE);
1676
1677 const MachineOperand *RegOp = nullptr;
1678 const MachineOperand *ImmOp = nullptr;
1679 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1680 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1681 if (Src0->isImm()) {
1682 ImmOp = Src0;
1683 RegOp = Src1;
1684 } else if (Src1->isImm()) {
1685 ImmOp = Src1;
1686 RegOp = Src0;
1687 } else
1688 return std::pair(nullptr, SIOutMods::NONE);
1689
1690 int OMod = getOModValue(Op, ImmOp->getImm());
1691 if (OMod == SIOutMods::NONE ||
1692 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1693 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1694 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1695 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1696 return std::pair(nullptr, SIOutMods::NONE);
1697
1698 return std::pair(RegOp, OMod);
1699 }
1700 case AMDGPU::V_ADD_F64_e64:
1701 case AMDGPU::V_ADD_F64_pseudo_e64:
1702 case AMDGPU::V_ADD_F32_e64:
1703 case AMDGPU::V_ADD_F16_e64:
1704 case AMDGPU::V_ADD_F16_t16_e64:
1705 case AMDGPU::V_ADD_F16_fake16_e64: {
1706 // If output denormals are enabled, omod is ignored.
1707 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1708 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1709 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1710 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1711 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1712 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1713 return std::pair(nullptr, SIOutMods::NONE);
1714
1715 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1716 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1717 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1718
1719 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1720 Src0->getSubReg() == Src1->getSubReg() &&
1721 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1722 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1723 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1724 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1725 return std::pair(Src0, SIOutMods::MUL2);
1726
1727 return std::pair(nullptr, SIOutMods::NONE);
1728 }
1729 default:
1730 return std::pair(nullptr, SIOutMods::NONE);
1731 }
1732}
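// Worked example (informal, virtual register names assumed): for
//   %r = V_MUL_F32_e64 %x, 2.0
// with no source modifiers, clamp or omod set, isOMod returns the %x operand
// paired with SIOutMods::MUL2. The V_ADD cases catch the DAGCombiner
// canonicalization of the same multiply, so
//   %r = V_ADD_F32_e64 %x, %x
// under the same constraints yields {src0, SIOutMods::MUL2} as well.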
1733
1734// FIXME: Does this need to check IEEE bit on function?
1735bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1736 const MachineOperand *RegOp;
1737 int OMod;
1738 std::tie(RegOp, OMod) = isOMod(MI);
1739 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1740 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1741 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1742 return false;
1743
1744 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1745 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1746 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1747 return false;
1748
1749 if (Def->mayRaiseFPException())
1750 return false;
1751
1752 // Clamp is applied after omod. If the source already has clamp set, don't
1753 // fold it.
1754 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1755 return false;
1756
1757 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1758
1759 DefOMod->setImm(OMod);
1760 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1761 MI.eraseFromParent();
1762
1763 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1764 // instruction, so we might as well convert it to the more flexible VOP3-only
1765 // mad/fma form.
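// For instance, a single-use V_FMAC_F32_e64 def whose omod we just set would
// typically be rewritten to V_FMA_F32_e64 here (informal example; the exact
// opcode mapping is whatever convertToThreeAddress chooses for the target).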
1766 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1767 Def->eraseFromParent();
1768
1769 return true;
1770}
1771
1772// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1773// instruction which can take an agpr. So far that means a store.
1774bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1775 assert(MI.isRegSequence());
1776 auto Reg = MI.getOperand(0).getReg();
1777
1778 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1779 !MRI->hasOneNonDBGUse(Reg))
1780 return false;
1781
1782 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1783 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1784 return false;
1785
1786 for (auto &[Op, SubIdx] : Defs) {
1787 if (!Op->isReg())
1788 return false;
1789 if (TRI->isAGPR(*MRI, Op->getReg()))
1790 continue;
1791 // Maybe this is a COPY from AREG
1792 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1793 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1794 return false;
1795 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1796 return false;
1797 }
1798
1799 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1800 MachineInstr *UseMI = Op->getParent();
1801 while (UseMI->isCopy() && !Op->getSubReg()) {
1802 Reg = UseMI->getOperand(0).getReg();
1803 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1804 return false;
1805 Op = &*MRI->use_nodbg_begin(Reg);
1806 UseMI = Op->getParent();
1807 }
1808
1809 if (Op->getSubReg())
1810 return false;
1811
1812 unsigned OpIdx = Op - &UseMI->getOperand(0);
1813 const MCInstrDesc &InstDesc = UseMI->getDesc();
1814 const TargetRegisterClass *OpRC =
1815 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1816 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1817 return false;
1818
1819 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1820 auto Dst = MRI->createVirtualRegister(NewDstRC);
1821 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1822 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1823
1824 for (auto &[Def, SubIdx] : Defs) {
1825 Def->setIsKill(false);
1826 if (TRI->isAGPR(*MRI, Def->getReg())) {
1827 RS.add(*Def);
1828 } else { // This is a copy
1829 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1830 SubDef->getOperand(1).setIsKill(false);
1831 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1832 }
1833 RS.addImm(SubIdx);
1834 }
1835
1836 Op->setReg(Dst);
1837 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1838 Op->setReg(Reg);
1839 RS->eraseFromParent();
1840 return false;
1841 }
1842
1843 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1844
1845 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1846 // in which case we can erase them all later in runOnMachineFunction.
1847 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1848 MI.eraseFromParent();
1849 return true;
1850}
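// Informal sketch of the fold above (register names and opcodes illustrative):
//   %v:vreg_64 = REG_SEQUENCE %a0:agpr_32, %subreg.sub0, %a1:agpr_32, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %v, ...
// becomes
//   %d:areg_64 = REG_SEQUENCE %a0, %subreg.sub0, %a1, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %d, ...
// provided the store's data operand uses an AV_* superclass that accepts AGPRs
// (the isVectorSuperClass check above).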
1851
1852/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
1853/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
1854static bool isAGPRCopy(const SIRegisterInfo &TRI,
1855 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1856 Register &OutReg, unsigned &OutSubReg) {
1857 assert(Copy.isCopy());
1858
1859 const MachineOperand &CopySrc = Copy.getOperand(1);
1860 Register CopySrcReg = CopySrc.getReg();
1861 if (!CopySrcReg.isVirtual())
1862 return false;
1863
1864 // Common case: copy from AGPR directly, e.g.
1865 // %1:vgpr_32 = COPY %0:agpr_32
1866 if (TRI.isAGPR(MRI, CopySrcReg)) {
1867 OutReg = CopySrcReg;
1868 OutSubReg = CopySrc.getSubReg();
1869 return true;
1870 }
1871
1872 // Sometimes it can also involve two copies, e.g.
1873 // %1:vgpr_256 = COPY %0:agpr_256
1874 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1875 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1876 if (!CopySrcDef || !CopySrcDef->isCopy())
1877 return false;
1878
1879 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1880 Register OtherCopySrcReg = OtherCopySrc.getReg();
1881 if (!OtherCopySrcReg.isVirtual() ||
1882 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1883 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1884 !TRI.isAGPR(MRI, OtherCopySrcReg))
1885 return false;
1886
1887 OutReg = OtherCopySrcReg;
1888 OutSubReg = CopySrc.getSubReg();
1889 return true;
1890}
1891
1892// Try to hoist an AGPR to VGPR copy across a PHI.
1893// This should allow folding of an AGPR into a consumer which may support it.
1894//
1895// Example 1: LCSSA PHI
1896// loop:
1897// %1:vreg = COPY %0:areg
1898// exit:
1899// %2:vreg = PHI %1:vreg, %loop
1900// =>
1901// loop:
1902// exit:
1903// %1:areg = PHI %0:areg, %loop
1904// %2:vreg = COPY %1:areg
1905//
1906// Example 2: PHI with multiple incoming values:
1907// entry:
1908// %1:vreg = GLOBAL_LOAD(..)
1909// loop:
1910// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1911// %3:areg = COPY %2:vreg
1912// %4:areg = (instr using %3:areg)
1913// %5:vreg = COPY %4:areg
1914// =>
1915// entry:
1916// %1:vreg = GLOBAL_LOAD(..)
1917// %2:areg = COPY %1:vreg
1918// loop:
1919// %3:areg = PHI %2:areg, %entry, %X:areg,
1920// %4:areg = (instr using %3:areg)
1921bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1922 assert(PHI.isPHI());
1923
1924 Register PhiOut = PHI.getOperand(0).getReg();
1925 if (!TRI->isVGPR(*MRI, PhiOut))
1926 return false;
1927
1928 // Iterate once over all incoming values of the PHI to check if this PHI is
1929 // eligible, and determine the exact AGPR RC we'll target.
1930 const TargetRegisterClass *ARC = nullptr;
1931 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1932 MachineOperand &MO = PHI.getOperand(K);
1933 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1934 if (!Copy || !Copy->isCopy())
1935 continue;
1936
1937 Register AGPRSrc;
1938 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1939 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1940 continue;
1941
1942 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1943 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1944 CopyInRC = SubRC;
1945
1946 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1947 return false;
1948 ARC = CopyInRC;
1949 }
1950
1951 if (!ARC)
1952 return false;
1953
1954 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1955
1956 // Rewrite the PHI's incoming values to ARC.
1957 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1958 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1959 MachineOperand &MO = PHI.getOperand(K);
1960 Register Reg = MO.getReg();
1961
1962 MachineBasicBlock::iterator InsertPt;
1963 MachineBasicBlock *InsertMBB = nullptr;
1964
1965 // Look at the def of Reg, ignoring all copies.
1966 unsigned CopyOpc = AMDGPU::COPY;
1967 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1968
1969 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1970 // the copy was single-use, it will be removed by DCE later.
1971 if (Def->isCopy()) {
1972 Register AGPRSrc;
1973 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1974 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1975 MO.setReg(AGPRSrc);
1976 MO.setSubReg(AGPRSubReg);
1977 continue;
1978 }
1979
1980 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1981 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1982 // to fold the sgpr -> vgpr -> agpr copy into an sgpr -> agpr copy which
1983 // is unlikely to be profitable.
1984 //
1985 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1986 MachineOperand &CopyIn = Def->getOperand(1);
1987 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1988 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1989 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1990 }
1991
1992 InsertMBB = Def->getParent();
1993 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1994 } else {
1995 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1996 InsertPt = InsertMBB->getFirstTerminator();
1997 }
1998
1999 Register NewReg = MRI->createVirtualRegister(ARC);
2000 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2001 TII->get(CopyOpc), NewReg)
2002 .addReg(Reg);
2003 MO.setReg(NewReg);
2004
2005 (void)MI;
2006 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2007 }
2008
2009 // Replace the PHI's result with a new register.
2010 Register NewReg = MRI->createVirtualRegister(ARC);
2011 PHI.getOperand(0).setReg(NewReg);
2012
2013 // COPY that new register back to the original PhiOut register. This COPY will
2014 // usually be folded out later.
2015 MachineBasicBlock *MBB = PHI.getParent();
2016 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2017 TII->get(AMDGPU::COPY), PhiOut)
2018 .addReg(NewReg);
2019
2020 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2021 return true;
2022}
2023
2024// Attempt to convert VGPR load to an AGPR load.
2025bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
2026 assert(MI.mayLoad());
2027 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2028 return false;
2029
2030 MachineOperand &Def = MI.getOperand(0);
2031 if (!Def.isDef())
2032 return false;
2033
2034 Register DefReg = Def.getReg();
2035
2036 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2037 return false;
2038
2039 SmallVector<const MachineInstr *, 8> Users;
2040 SmallVector<Register, 8> MoveRegs;
2041 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2042 Users.push_back(&I);
2043
2044 if (Users.empty())
2045 return false;
2046
2047 // Check that every use is a copy to an agpr or a reg_sequence producing an agpr.
2048 while (!Users.empty()) {
2049 const MachineInstr *I = Users.pop_back_val();
2050 if (!I->isCopy() && !I->isRegSequence())
2051 return false;
2052 Register DstReg = I->getOperand(0).getReg();
2053 // Physical registers may have more than one defining instruction.
2054 if (DstReg.isPhysical())
2055 return false;
2056 if (TRI->isAGPR(*MRI, DstReg))
2057 continue;
2058 MoveRegs.push_back(DstReg);
2059 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2060 Users.push_back(&U);
2061 }
2062
2063 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2064 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2065 if (!TII->isOperandLegal(MI, 0, &Def)) {
2066 MRI->setRegClass(DefReg, RC);
2067 return false;
2068 }
2069
2070 while (!MoveRegs.empty()) {
2071 Register Reg = MoveRegs.pop_back_val();
2072 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2073 }
2074
2075 LLVM_DEBUG(dbgs() << "Folded " << MI);
2076
2077 return true;
2078}
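// Informal sketch (register names assumed): for
//   %v:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %a:agpr_32 = COPY %v
// where every transitive user of %v is such a COPY (or a REG_SEQUENCE feeding
// AGPRs), %v's register class is switched to agpr_32 so the load writes the
// AGPR directly, and the classes of the downstream copy results are updated to
// match.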
2079
2080// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2081// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2082 // there are cases where it can create a lot more AGPR-AGPR copies, which are
2083// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2084//
2085// This function looks at all AGPR PHIs in a basic block and collects their
2086 // operands. Then, it checks for registers that are used more than once across
2087// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2088// having to create one VGPR temporary per use, which can get very messy if
2089// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2090// element).
2091//
2092// Example
2093// a:
2094// %in:agpr_256 = COPY %foo:vgpr_256
2095// c:
2096// %x:agpr_32 = ..
2097// b:
2098// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2099// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2100// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2101// =>
2102// a:
2103// %in:agpr_256 = COPY %foo:vgpr_256
2104// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2105// %tmp_agpr:agpr_32 = COPY %tmp
2106// c:
2107// %x:agpr_32 = ..
2108// b:
2109// %0:areg = PHI %tmp_agpr, %a, %x, %c
2110// %1:areg = PHI %tmp_agpr, %a, %y, %c
2111// %2:areg = PHI %tmp_agpr, %a, %z, %c
2112bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2113 // This is only really needed on GFX908 where AGPR-AGPR copies are
2114 // unreasonably difficult.
2115 if (ST->hasGFX90AInsts())
2116 return false;
2117
2118 // Look at all AGPR Phis and collect the register + subregister used.
2119 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2120 RegToMO;
2121
2122 for (auto &MI : MBB) {
2123 if (!MI.isPHI())
2124 break;
2125
2126 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2127 continue;
2128
2129 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2130 MachineOperand &PhiMO = MI.getOperand(K);
2131 if (!PhiMO.getSubReg())
2132 continue;
2133 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2134 }
2135 }
2136
2137 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2138 // a VGPR.
2139 bool Changed = false;
2140 for (const auto &[Entry, MOs] : RegToMO) {
2141 if (MOs.size() == 1)
2142 continue;
2143
2144 const auto [Reg, SubReg] = Entry;
2145 MachineInstr *Def = MRI->getVRegDef(Reg);
2146 MachineBasicBlock *DefMBB = Def->getParent();
2147
2148 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2149 // out.
2150 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2151 Register TempVGPR =
2152 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2153 MachineInstr *VGPRCopy =
2154 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2155 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2156 .addReg(Reg, /* flags */ 0, SubReg);
2157
2158 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2159 Register TempAGPR = MRI->createVirtualRegister(ARC);
2160 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2161 TII->get(AMDGPU::COPY), TempAGPR)
2162 .addReg(TempVGPR);
2163
2164 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2165 for (MachineOperand *MO : MOs) {
2166 MO->setReg(TempAGPR);
2167 MO->setSubReg(AMDGPU::NoSubRegister);
2168 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2169 }
2170
2171 Changed = true;
2172 }
2173
2174 return Changed;
2175}
2176
2177bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2178 if (skipFunction(MF.getFunction()))
2179 return false;
2180
2181 MRI = &MF.getRegInfo();
2182 ST = &MF.getSubtarget<GCNSubtarget>();
2183 TII = ST->getInstrInfo();
2184 TRI = &TII->getRegisterInfo();
2185 MFI = MF.getInfo<SIMachineFunctionInfo>();
2186
2187 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2188 // correctly handle signed zeros.
2189 //
2190 // FIXME: Also need to check strictfp
2191 bool IsIEEEMode = MFI->getMode().IEEE;
2192 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2193
2194 bool Changed = false;
2195 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2196 MachineOperand *CurrentKnownM0Val = nullptr;
2197 for (auto &MI : make_early_inc_range(*MBB)) {
2198 Changed |= tryFoldCndMask(MI);
2199
2200 if (tryFoldZeroHighBits(MI)) {
2201 Changed = true;
2202 continue;
2203 }
2204
2205 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2206 Changed = true;
2207 continue;
2208 }
2209
2210 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2211 Changed = true;
2212 continue;
2213 }
2214
2215 if (MI.mayLoad() && tryFoldLoad(MI)) {
2216 Changed = true;
2217 continue;
2218 }
2219
2220 if (TII->isFoldableCopy(MI)) {
2221 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2222 continue;
2223 }
2224
2225 // Saw an unknown clobber of m0, so we no longer know what it is.
2226 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2227 CurrentKnownM0Val = nullptr;
2228
2229 // TODO: Omod might be OK if there is NSZ only on the source
2230 // instruction, and not the omod multiply.
2231 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2232 !tryFoldOMod(MI))
2233 Changed |= tryFoldClamp(MI);
2234 }
2235
2236 Changed |= tryOptimizeAGPRPhis(*MBB);
2237 }
2238
2239 return Changed;
2240}