SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
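// A single pending fold: the instruction and operand number that will receive
// the folded value, the value itself (register, immediate, frame index or
// global address), and whether the use had to be commuted or must be shrunk to
// its 32-bit encoding for the fold to be legal.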
24struct FoldCandidate {
25 MachineInstr *UseMI;
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
31 int ShrinkOpcode;
32 unsigned UseOpNo;
33 MachineOperand::MachineOperandType Kind;
34 bool Commuted;
35
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
69class SIFoldOperands : public MachineFunctionPass {
70public:
71 static char ID;
72 MachineRegisterInfo *MRI;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 bool updateOperand(FoldCandidate &Fold) const;
82
83 bool canUseImmWithOpSel(FoldCandidate &Fold) const;
84
85 bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
86
87 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
88 MachineInstr *MI, unsigned OpNo,
89 MachineOperand *OpToFold) const;
90 bool isUseSafeToFold(const MachineInstr &MI,
91 const MachineOperand &UseMO) const;
92 bool
93 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
94 Register UseReg, uint8_t OpTy) const;
95 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
96 unsigned UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList) const;
98 void foldOperand(MachineOperand &OpToFold,
99 MachineInstr *UseMI,
100 int UseOpIdx,
101 SmallVectorImpl<FoldCandidate> &FoldList,
102 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
103
104 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
105 bool tryConstantFoldOp(MachineInstr *MI) const;
106 bool tryFoldCndMask(MachineInstr &MI) const;
107 bool tryFoldZeroHighBits(MachineInstr &MI) const;
108 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
109 bool tryFoldFoldableCopy(MachineInstr &MI,
110 MachineOperand *&CurrentKnownM0Val) const;
111
112 const MachineOperand *isClamp(const MachineInstr &MI) const;
113 bool tryFoldClamp(MachineInstr &MI);
114
115 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
116 bool tryFoldOMod(MachineInstr &MI);
117 bool tryFoldRegSequence(MachineInstr &MI);
118 bool tryFoldPhiAGPR(MachineInstr &MI);
119 bool tryFoldLoad(MachineInstr &MI);
120
121 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
122
123public:
124 SIFoldOperands() : MachineFunctionPass(ID) {
125 initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
126 }
127
128 bool runOnMachineFunction(MachineFunction &MF) override;
129
130 StringRef getPassName() const override { return "SI Fold Operands"; }
131
132 void getAnalysisUsage(AnalysisUsage &AU) const override {
133 AU.setPreservesCFG();
134 MachineFunctionPass::getAnalysisUsage(AU);
135 }
136};
137
138} // End anonymous namespace.
139
140INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
141 "SI Fold Operands", false, false)
142
143char SIFoldOperands::ID = 0;
144
145char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
146
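// Return the register class of MO's register, narrowed to the class of its
// subregister when a subregister index is present.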
147static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
148 const TargetRegisterInfo &TRI,
149 const MachineOperand &MO) {
150 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
151 if (const TargetRegisterClass *SubRC =
152 TRI.getSubRegisterClass(RC, MO.getSubReg()))
153 RC = SubRC;
154 return RC;
155}
156
157// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
158static unsigned macToMad(unsigned Opc) {
159 switch (Opc) {
160 case AMDGPU::V_MAC_F32_e64:
161 return AMDGPU::V_MAD_F32_e64;
162 case AMDGPU::V_MAC_F16_e64:
163 return AMDGPU::V_MAD_F16_e64;
164 case AMDGPU::V_FMAC_F32_e64:
165 return AMDGPU::V_FMA_F32_e64;
166 case AMDGPU::V_FMAC_F16_e64:
167 return AMDGPU::V_FMA_F16_gfx9_e64;
168 case AMDGPU::V_FMAC_F16_t16_e64:
169 return AMDGPU::V_FMA_F16_gfx9_e64;
170 case AMDGPU::V_FMAC_LEGACY_F32_e64:
171 return AMDGPU::V_FMA_LEGACY_F32_e64;
172 case AMDGPU::V_FMAC_F64_e64:
173 return AMDGPU::V_FMA_F64_e64;
174 }
175 return AMDGPU::INSTRUCTION_LIST_END;
176}
177
178// TODO: Add heuristic that the frame index might not fit in the addressing mode
179// immediate offset to avoid materializing in loops.
180bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
181 const MachineOperand &OpToFold) const {
182 if (!OpToFold.isFI())
183 return false;
184
185 const unsigned Opc = UseMI.getOpcode();
186 if (TII->isMUBUF(UseMI))
187 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
188 if (!TII->isFLATScratch(UseMI))
189 return false;
190
191 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
192 if (OpNo == SIdx)
193 return true;
194
195 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
196 return OpNo == VIdx && SIdx == -1;
197}
198
199FunctionPass *llvm::createSIFoldOperandsPass() {
200 return new SIFoldOperands();
201}
202
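// Check whether an immediate could be folded into this operand through the
// packed (op_sel) path at all: the instruction must be a packed op that is not
// MAI/WMMA/SWMMAC, and the operand must be one of the packed 16-bit vector
// source operand types.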
203bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
204 MachineInstr *MI = Fold.UseMI;
205 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
206 const uint64_t TSFlags = MI->getDesc().TSFlags;
207
208 assert(Old.isReg() && Fold.isImm());
209
210 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
211 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
212 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
213 return false;
214
215 unsigned Opcode = MI->getOpcode();
216 int OpNo = MI->getOperandNo(&Old);
217 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
218 switch (OpType) {
219 default:
220 return false;
221 case AMDGPU::OPERAND_REG_IMM_V2FP16:
222 case AMDGPU::OPERAND_REG_IMM_V2BF16:
223 case AMDGPU::OPERAND_REG_IMM_V2INT16:
224 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
225 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
226 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
227 break;
228 }
229
230 return true;
231}
232
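// Try to fold an immediate into a packed operand, rewriting op_sel/op_sel_hi
// (and, for packed integer add/sub, possibly negating the constant and
// swapping the opcode) so that the value becomes an inline constant.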
233bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
234 MachineInstr *MI = Fold.UseMI;
235 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
236 unsigned Opcode = MI->getOpcode();
237 int OpNo = MI->getOperandNo(&Old);
238 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
239
240 // If the literal can be inlined as-is, apply it and short-circuit the
241 // tests below. The main motivation for this is to avoid unintuitive
242 // uses of opsel.
243 if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
244 Old.ChangeToImmediate(Fold.ImmToFold);
245 return true;
246 }
247
248 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
249 // op_sel in a way that allows an inline constant.
250 int ModIdx = -1;
251 unsigned SrcIdx = ~0;
252 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
253 ModIdx = AMDGPU::OpName::src0_modifiers;
254 SrcIdx = 0;
255 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
256 ModIdx = AMDGPU::OpName::src1_modifiers;
257 SrcIdx = 1;
258 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
259 ModIdx = AMDGPU::OpName::src2_modifiers;
260 SrcIdx = 2;
261 }
262 assert(ModIdx != -1);
263 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
264 MachineOperand &Mod = MI->getOperand(ModIdx);
265 unsigned ModVal = Mod.getImm();
266
267 uint16_t ImmLo = static_cast<uint16_t>(
268 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
269 uint16_t ImmHi = static_cast<uint16_t>(
270 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
271 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
272 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
273
274 // Helper function that attempts to inline the given value with a newly
275 // chosen opsel pattern.
276 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
277 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
278 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
279 Old.ChangeToImmediate(Imm);
280 return true;
281 }
282
283 // Try to shuffle the halves around and leverage opsel to get an inline
284 // constant.
285 uint16_t Lo = static_cast<uint16_t>(Imm);
286 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
287 if (Lo == Hi) {
288 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
289 Mod.setImm(NewModVal);
290 Old.ChangeToImmediate(Lo);
291 return true;
292 }
293
294 if (static_cast<int16_t>(Lo) < 0) {
295 int32_t SExt = static_cast<int16_t>(Lo);
296 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
297 Mod.setImm(NewModVal);
298 Old.ChangeToImmediate(SExt);
299 return true;
300 }
301 }
302
303 // This check is only useful for integer instructions
304 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
305 OpType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16) {
306 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
307 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
308 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
309 return true;
310 }
311 }
312 } else {
313 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
314 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
315 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
316 Old.ChangeToImmediate(Swapped);
317 return true;
318 }
319 }
320
321 return false;
322 };
323
324 if (tryFoldToInline(Imm))
325 return true;
326
327 // Replace integer addition by subtraction and vice versa if it allows
328 // folding the immediate to an inline constant.
329 //
330 // We should only ever get here for SrcIdx == 1 due to canonicalization
331 // earlier in the pipeline, but we double-check here to be safe / fully
332 // general.
333 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
334 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
335 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
336 unsigned ClampIdx =
337 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
338 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
339
340 if (!Clamp) {
341 uint16_t NegLo = -static_cast<uint16_t>(Imm);
342 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
343 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
344
345 if (tryFoldToInline(NegImm)) {
346 unsigned NegOpcode =
347 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
348 MI->setDesc(TII->get(NegOpcode));
349 return true;
350 }
351 }
352 }
353
354 return false;
355}
356
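// Commit a fold candidate: rewrite the use operand in place, shrinking the
// instruction to its 32-bit encoding first when the candidate requires it.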
357bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
358 MachineInstr *MI = Fold.UseMI;
359 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
360 assert(Old.isReg());
361
362 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
363 if (tryFoldImmWithOpSel(Fold))
364 return true;
365
366 // We can't represent the candidate as an inline constant. Try as a literal
367 // with the original opsel, checking constant bus limitations.
368 MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
369 int OpNo = MI->getOperandNo(&Old);
370 if (!TII->isOperandLegal(*MI, OpNo, &New))
371 return false;
372 Old.ChangeToImmediate(Fold.ImmToFold);
373 return true;
374 }
375
376 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
377 MachineBasicBlock *MBB = MI->getParent();
378 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
379 if (Liveness != MachineBasicBlock::LQR_Dead) {
380 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
381 return false;
382 }
383
384 int Op32 = Fold.ShrinkOpcode;
385 MachineOperand &Dst0 = MI->getOperand(0);
386 MachineOperand &Dst1 = MI->getOperand(1);
387 assert(Dst0.isDef() && Dst1.isDef());
388
389 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
390
391 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
392 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
393
394 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
395
396 if (HaveNonDbgCarryUse) {
397 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
398 Dst1.getReg())
399 .addReg(AMDGPU::VCC, RegState::Kill);
400 }
401
402 // Keep the old instruction around to avoid breaking iterators, but
403 // replace it with a dummy instruction to remove uses.
404 //
405 // FIXME: We should not invert how this pass looks at operands to avoid
406 // this. Should track set of foldable movs instead of looking for uses
407 // when looking at a use.
408 Dst0.setReg(NewReg0);
409 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
410 MI->removeOperand(I);
411 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
412
413 if (Fold.Commuted)
414 TII->commuteInstruction(*Inst32, false);
415 return true;
416 }
417
418 assert(!Fold.needsShrink() && "not handled");
419
420 if (Fold.isImm()) {
421 if (Old.isTied()) {
422 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
423 if (NewMFMAOpc == -1)
424 return false;
425 MI->setDesc(TII->get(NewMFMAOpc));
426 MI->untieRegOperand(0);
427 }
428 Old.ChangeToImmediate(Fold.ImmToFold);
429 return true;
430 }
431
432 if (Fold.isGlobal()) {
433 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
434 Fold.OpToFold->getTargetFlags());
435 return true;
436 }
437
438 if (Fold.isFI()) {
439 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
440 return true;
441 }
442
443 MachineOperand *New = Fold.OpToFold;
444 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
445 Old.setIsUndef(New->isUndef());
446 return true;
447}
448
449static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
450 const MachineInstr *MI) {
451 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
452}
453
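// Append a fold candidate for (MI, OpNo) unless that operand already has one.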
454static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
455 MachineInstr *MI, unsigned OpNo,
456 MachineOperand *FoldOp, bool Commuted = false,
457 int ShrinkOp = -1) {
458 // Skip additional folding on the same operand.
459 for (FoldCandidate &Fold : FoldList)
460 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
461 return;
462 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
463 << " operand " << OpNo << "\n " << *MI);
464 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
465}
466
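// Try to make OpToFold a legal fold into operand OpNo of MI, if necessary by
// switching MI to an equivalent opcode (mac -> mad, s_fmac -> s_fmaak/s_fmamk,
// s_setreg -> s_setreg_imm32) or by commuting its operands.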
467bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
468 MachineInstr *MI, unsigned OpNo,
469 MachineOperand *OpToFold) const {
470 const unsigned Opc = MI->getOpcode();
471
472 auto tryToFoldAsFMAAKorMK = [&]() {
473 if (!OpToFold->isImm())
474 return false;
475
476 const bool TryAK = OpNo == 3;
477 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
478 MI->setDesc(TII->get(NewOpc));
479
480 // We have to fold into operand which would be Imm not into OpNo.
481 bool FoldAsFMAAKorMK =
482 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
483 if (FoldAsFMAAKorMK) {
484 // Untie Src2 of fmac.
485 MI->untieRegOperand(3);
486 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
487 if (OpNo == 1) {
488 MachineOperand &Op1 = MI->getOperand(1);
489 MachineOperand &Op2 = MI->getOperand(2);
490 Register OldReg = Op1.getReg();
491 // Operand 2 might be an inlinable constant
492 if (Op2.isImm()) {
493 Op1.ChangeToImmediate(Op2.getImm());
494 Op2.ChangeToRegister(OldReg, false);
495 } else {
496 Op1.setReg(Op2.getReg());
497 Op2.setReg(OldReg);
498 }
499 }
500 return true;
501 }
502 MI->setDesc(TII->get(Opc));
503 return false;
504 };
505
506 bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
507 if (!IsLegal && OpToFold->isImm()) {
508 FoldCandidate Fold(MI, OpNo, OpToFold);
509 IsLegal = canUseImmWithOpSel(Fold);
510 }
511
512 if (!IsLegal) {
513 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
514 unsigned NewOpc = macToMad(Opc);
515 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
516 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
517 // to fold the operand.
518 MI->setDesc(TII->get(NewOpc));
519 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
520 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
521 if (AddOpSel)
522 MI->addOperand(MachineOperand::CreateImm(0));
523 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
524 if (FoldAsMAD) {
525 MI->untieRegOperand(OpNo);
526 return true;
527 }
528 if (AddOpSel)
529 MI->removeOperand(MI->getNumExplicitOperands() - 1);
530 MI->setDesc(TII->get(Opc));
531 }
532
533 // Special case for s_fmac_f32 if we are trying to fold into Src2.
534 // By transforming into fmaak we can untie Src2 and make folding legal.
535 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
536 if (tryToFoldAsFMAAKorMK())
537 return true;
538 }
539
540 // Special case for s_setreg_b32
541 if (OpToFold->isImm()) {
542 unsigned ImmOpc = 0;
543 if (Opc == AMDGPU::S_SETREG_B32)
544 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
545 else if (Opc == AMDGPU::S_SETREG_B32_mode)
546 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
547 if (ImmOpc) {
548 MI->setDesc(TII->get(ImmOpc));
549 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
550 return true;
551 }
552 }
553
554 // If we are already folding into another operand of MI, then
555 // we can't commute the instruction, otherwise we risk making the
556 // other fold illegal.
557 if (isUseMIInFoldList(FoldList, MI))
558 return false;
559
560 // Operand is not legal, so try to commute the instruction to
561 // see if this makes it possible to fold.
562 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
563 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
564 if (!CanCommute)
565 return false;
566
567 // One of operands might be an Imm operand, and OpNo may refer to it after
568 // the call of commuteInstruction() below. Such situations are avoided
569 // here explicitly as OpNo must be a register operand to be a candidate
570 // for memory folding.
571 if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
572 return false;
573
574 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
575 return false;
576
577 int Op32 = -1;
578 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
579 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
580 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
581 (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
582 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
583 return false;
584 }
585
586 // Verify the other operand is a VGPR, otherwise we would violate the
587 // constant bus restriction.
588 MachineOperand &OtherOp = MI->getOperand(OpNo);
589 if (!OtherOp.isReg() ||
590 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
591 return false;
592
593 assert(MI->getOperand(1).isDef());
594
595 // Make sure to get the 32-bit version of the commuted opcode.
596 unsigned MaybeCommutedOpc = MI->getOpcode();
597 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
598 }
599
600 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
601 return true;
602 }
603
604 // Inlineable constant might have been folded into Imm operand of fmaak or
605 // fmamk and we are trying to fold a non-inlinable constant.
606 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
607 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
608 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
609 MachineOperand &OpImm = MI->getOperand(ImmIdx);
610 if (!OpImm.isReg() &&
611 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
612 return tryToFoldAsFMAAKorMK();
613 }
614
615 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
616 // By changing into fmamk we can untie Src2.
617 // If folding for Src0 happens first and it is identical operand to Src1 we
618 // should avoid transforming into fmamk which requires commuting as it would
619 // cause folding into Src1 to fail later on due to wrong OpNo used.
620 if (Opc == AMDGPU::S_FMAC_F32 &&
621 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
622 if (tryToFoldAsFMAAKorMK())
623 return true;
624 }
625
626 // Check the case where we might introduce a second constant operand to a
627 // scalar instruction
628 if (TII->isSALU(MI->getOpcode())) {
629 const MCInstrDesc &InstDesc = MI->getDesc();
630 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
631
632 // Fine if the operand can be encoded as an inline constant
633 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
634 // Otherwise check for another constant
635 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
636 auto &Op = MI->getOperand(i);
637 if (OpNo != i && !Op.isReg() &&
638 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
639 return false;
640 }
641 }
642 }
643
644 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
645 return true;
646}
647
648bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
649 const MachineOperand &UseMO) const {
650 // Operands of SDWA instructions must be registers.
651 return !TII->isSDWA(MI);
652}
653
654// Find a def of the UseReg, check if it is a reg_sequence and find initializers
655// for each subreg, tracking it to foldable inline immediate if possible.
656// Returns true on success.
657bool SIFoldOperands::getRegSeqInit(
658 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
659 Register UseReg, uint8_t OpTy) const {
660 MachineInstr *Def = MRI->getVRegDef(UseReg);
661 if (!Def || !Def->isRegSequence())
662 return false;
663
664 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
665 MachineOperand *Sub = &Def->getOperand(I);
666 assert(Sub->isReg());
667
668 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
669 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
670 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
671 SubDef = MRI->getVRegDef(Sub->getReg())) {
672 MachineOperand *Op = &SubDef->getOperand(1);
673 if (Op->isImm()) {
674 if (TII->isInlineConstant(*Op, OpTy))
675 Sub = Op;
676 break;
677 }
678 if (!Op->isReg() || Op->getReg().isPhysical())
679 break;
680 Sub = Op;
681 }
682
683 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
684 }
685
686 return true;
687}
688
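// Try to fold OpToFold as an inline immediate into operand UseOpIdx of UseMI,
// looking through foldable copies and splat REG_SEQUENCE initializers.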
689bool SIFoldOperands::tryToFoldACImm(
690 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
691 SmallVectorImpl<FoldCandidate> &FoldList) const {
692 const MCInstrDesc &Desc = UseMI->getDesc();
693 if (UseOpIdx >= Desc.getNumOperands())
694 return false;
695
696 if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
697 return false;
698
699 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
700 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
701 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
702 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
703 return true;
704 }
705
706 if (!OpToFold.isReg())
707 return false;
708
709 Register UseReg = OpToFold.getReg();
710 if (!UseReg.isVirtual())
711 return false;
712
713 if (isUseMIInFoldList(FoldList, UseMI))
714 return false;
715
716 // Maybe it is just a COPY of an immediate itself.
717 MachineInstr *Def = MRI->getVRegDef(UseReg);
718 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
719 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
720 MachineOperand &DefOp = Def->getOperand(1);
721 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
722 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
723 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
724 return true;
725 }
726 }
727
728 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
729 if (!getRegSeqInit(Defs, UseReg, OpTy))
730 return false;
731
732 int32_t Imm;
733 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
734 const MachineOperand *Op = Defs[I].first;
735 if (!Op->isImm())
736 return false;
737
738 auto SubImm = Op->getImm();
739 if (!I) {
740 Imm = SubImm;
741 if (!TII->isInlineConstant(*Op, OpTy) ||
742 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
743 return false;
744
745 continue;
746 }
747 if (Imm != SubImm)
748 return false; // Can only fold splat constants
749 }
750
751 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
752 return true;
753}
754
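// Fold OpToFold into operand UseOpIdx of UseMI, handling REG_SEQUENCE users,
// frame indexes, copies that can be turned into MOVs, and readfirstlane /
// readlane of materialized values; register folds are queued on FoldList.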
755void SIFoldOperands::foldOperand(
756 MachineOperand &OpToFold,
757 MachineInstr *UseMI,
758 int UseOpIdx,
759 SmallVectorImpl<FoldCandidate> &FoldList,
760 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
761 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
762
763 if (!isUseSafeToFold(*UseMI, *UseOp))
764 return;
765
766 // FIXME: Fold operands with subregs.
767 if (UseOp->isReg() && OpToFold.isReg() &&
768 (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
769 return;
770
771 // Special case for REG_SEQUENCE: We can't fold literals into
772 // REG_SEQUENCE instructions, so we have to fold them into the
773 // uses of REG_SEQUENCE.
774 if (UseMI->isRegSequence()) {
775 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
776 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
777
778 // Grab the use operands first
779 SmallVector<MachineOperand *, 4> UsesToProcess;
780 for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
781 UsesToProcess.push_back(&Use);
782 for (auto *RSUse : UsesToProcess) {
783 MachineInstr *RSUseMI = RSUse->getParent();
784
785 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
786 RSUseMI->getOperandNo(RSUse), FoldList))
787 continue;
788
789 if (RSUse->getSubReg() != RegSeqDstSubReg)
790 continue;
791
792 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
793 CopiesToReplace);
794 }
795 return;
796 }
797
798 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
799 return;
800
801 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
802 // Verify that this is a stack access.
803 // FIXME: Should probably use stack pseudos before frame lowering.
804
805 if (TII->isMUBUF(*UseMI)) {
806 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
807 MFI->getScratchRSrcReg())
808 return;
809
810 // Ensure this is either relative to the current frame or the current
811 // wave.
812 MachineOperand &SOff =
813 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
814 if (!SOff.isImm() || SOff.getImm() != 0)
815 return;
816 }
817
818 // A frame index will resolve to a positive constant, so it should always be
819 // safe to fold the addressing mode, even pre-GFX9.
820 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
821
822 const unsigned Opc = UseMI->getOpcode();
823 if (TII->isFLATScratch(*UseMI) &&
824 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
825 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
826 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
827 UseMI->setDesc(TII->get(NewOpc));
828 }
829
830 return;
831 }
832
833 bool FoldingImmLike =
834 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
835
836 if (FoldingImmLike && UseMI->isCopy()) {
837 Register DestReg = UseMI->getOperand(0).getReg();
838 Register SrcReg = UseMI->getOperand(1).getReg();
839 assert(SrcReg.isVirtual());
840
841 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
842
843 // Don't fold into a copy to a physical register with the same class. Doing
844 // so would interfere with the register coalescer's logic which would avoid
845 // redundant initializations.
846 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
847 return;
848
849 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
850 if (!DestReg.isPhysical()) {
851 if (DestRC == &AMDGPU::AGPR_32RegClass &&
852 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
853 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
854 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
855 CopiesToReplace.push_back(UseMI);
856 return;
857 }
858 }
859
860 // In order to fold immediates into copies, we need to change the
861 // copy to a MOV.
862
863 unsigned MovOp = TII->getMovOpcode(DestRC);
864 if (MovOp == AMDGPU::COPY)
865 return;
866
867 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
868 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
869 while (ImpOpI != ImpOpE) {
870 MachineInstr::mop_iterator Tmp = ImpOpI;
871 ImpOpI++;
872 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
873 }
874 UseMI->setDesc(TII->get(MovOp));
875
876 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
877 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
878 MachineOperand NewSrcOp(SrcOp);
879 MachineFunction *MF = UseMI->getParent()->getParent();
880 UseMI->removeOperand(1);
881 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
882 UseMI->addOperand(NewSrcOp); // src0
883 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
884 UseOpIdx = 2;
885 UseOp = &UseMI->getOperand(UseOpIdx);
886 }
887 CopiesToReplace.push_back(UseMI);
888 } else {
889 if (UseMI->isCopy() && OpToFold.isReg() &&
890 UseMI->getOperand(0).getReg().isVirtual() &&
891 !UseMI->getOperand(1).getSubReg()) {
892 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
893 unsigned Size = TII->getOpSize(*UseMI, 1);
894 Register UseReg = OpToFold.getReg();
895 UseMI->getOperand(1).setReg(UseReg);
896 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
897 UseMI->getOperand(1).setIsKill(false);
898 CopiesToReplace.push_back(UseMI);
899 OpToFold.setIsKill(false);
900
901 // Remove kill flags as kills may now be out of order with uses.
902 MRI->clearKillFlags(OpToFold.getReg());
903
904 // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
905 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
906 // its initializers right here, so we will rematerialize immediates and
907 // avoid copies via different reg classes.
908 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
909 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
910 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
911 const DebugLoc &DL = UseMI->getDebugLoc();
912 MachineBasicBlock &MBB = *UseMI->getParent();
913
914 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
915 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
916 UseMI->removeOperand(I);
917
918 MachineInstrBuilder B(*MBB.getParent(), UseMI);
919 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
920 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
921 for (unsigned I = 0; I < Size / 4; ++I) {
922 MachineOperand *Def = Defs[I].first;
923 TargetInstrInfo::RegSubRegPair CopyToVGPR;
924 if (Def->isImm() &&
925 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
926 int64_t Imm = Def->getImm();
927
928 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
929 BuildMI(MBB, UseMI, DL,
930 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
931 B.addReg(Tmp);
932 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
933 auto Src = getRegSubRegPair(*Def);
934 Def->setIsKill(false);
935 if (!SeenAGPRs.insert(Src)) {
936 // We cannot build a reg_sequence out of the same registers, they
937 // must be copied. Better do it here before copyPhysReg() created
938 // several reads to do the AGPR->VGPR->AGPR copy.
939 CopyToVGPR = Src;
940 } else {
941 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
942 Src.SubReg);
943 }
944 } else {
945 assert(Def->isReg());
946 Def->setIsKill(false);
947 auto Src = getRegSubRegPair(*Def);
948
949 // Direct copy from SGPR to AGPR is not possible. To avoid creation
950 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
951 // create a copy here and track if we already have such a copy.
952 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
953 CopyToVGPR = Src;
954 } else {
955 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
956 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
957 B.addReg(Tmp);
958 }
959 }
960
961 if (CopyToVGPR.Reg) {
962 Register Vgpr;
963 if (VGPRCopies.count(CopyToVGPR)) {
964 Vgpr = VGPRCopies[CopyToVGPR];
965 } else {
966 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
967 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
968 VGPRCopies[CopyToVGPR] = Vgpr;
969 }
970 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
971 BuildMI(MBB, UseMI, DL,
972 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
973 B.addReg(Tmp);
974 }
975
976 B.addImm(Defs[I].second);
977 }
978 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
979 return;
980 }
981
982 if (Size != 4)
983 return;
984
985 Register Reg0 = UseMI->getOperand(0).getReg();
986 Register Reg1 = UseMI->getOperand(1).getReg();
987 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
988 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
989 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
990 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
991 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
992 TRI->isAGPR(*MRI, Reg1))
993 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
994 return;
995 }
996
997 unsigned UseOpc = UseMI->getOpcode();
998 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
999 (UseOpc == AMDGPU::V_READLANE_B32 &&
1000 (int)UseOpIdx ==
1001 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1002 // %vgpr = V_MOV_B32 imm
1003 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1004 // =>
1005 // %sgpr = S_MOV_B32 imm
1006 if (FoldingImmLike) {
1007 if (execMayBeModifiedBeforeUse(*MRI,
1008 UseMI->getOperand(UseOpIdx).getReg(),
1009 *OpToFold.getParent(),
1010 *UseMI))
1011 return;
1012
1013 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1014
1015 if (OpToFold.isImm())
1016 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1017 else
1018 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
1019 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1020 return;
1021 }
1022
1023 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1024 if (execMayBeModifiedBeforeUse(*MRI,
1025 UseMI->getOperand(UseOpIdx).getReg(),
1026 *OpToFold.getParent(),
1027 *UseMI))
1028 return;
1029
1030 // %vgpr = COPY %sgpr0
1031 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1032 // =>
1033 // %sgpr1 = COPY %sgpr0
1034 UseMI->setDesc(TII->get(AMDGPU::COPY));
1035 UseMI->getOperand(1).setReg(OpToFold.getReg());
1036 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1037 UseMI->getOperand(1).setIsKill(false);
1038 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1039 return;
1040 }
1041 }
1042
1043 const MCInstrDesc &UseDesc = UseMI->getDesc();
1044
1045 // Don't fold into target independent nodes. Target independent opcodes
1046 // don't have defined register classes.
1047 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1048 UseDesc.operands()[UseOpIdx].RegClass == -1)
1049 return;
1050 }
1051
1052 if (!FoldingImmLike) {
1053 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
1054 // Don't fold if OpToFold doesn't hold an aligned register.
1055 const TargetRegisterClass *RC =
1056 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
1057 assert(RC);
1058 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
1059 unsigned SubReg = OpToFold.getSubReg();
1060 if (const TargetRegisterClass *SubRC =
1061 TRI->getSubRegisterClass(RC, SubReg))
1062 RC = SubRC;
1063 }
1064
1065 if (!RC || !TRI->isProperlyAlignedRC(*RC))
1066 return;
1067 }
1068
1069 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1070
1071 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1072 // to enable more folding opportunities. The shrink operands pass
1073 // already does this.
1074 return;
1075 }
1076
1077
1078 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
1079 const TargetRegisterClass *FoldRC =
1080 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
1081
1082 // Split 64-bit constants into 32-bits for folding.
1083 if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
1084 Register UseReg = UseOp->getReg();
1085 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
1086 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
1087 return;
1088
1089 APInt Imm(64, OpToFold.getImm());
1090 if (UseOp->getSubReg() == AMDGPU::sub0) {
1091 Imm = Imm.getLoBits(32);
1092 } else {
1093 assert(UseOp->getSubReg() == AMDGPU::sub1);
1094 Imm = Imm.getHiBits(32);
1095 }
1096
1097 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
1098 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
1099 return;
1100 }
1101
1102 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1103}
1104
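// Constant-fold a 32-bit bitwise or shift instruction; returns false for
// opcodes this helper does not know how to evaluate.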
1105static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1106 uint32_t LHS, uint32_t RHS) {
1107 switch (Opcode) {
1108 case AMDGPU::V_AND_B32_e64:
1109 case AMDGPU::V_AND_B32_e32:
1110 case AMDGPU::S_AND_B32:
1111 Result = LHS & RHS;
1112 return true;
1113 case AMDGPU::V_OR_B32_e64:
1114 case AMDGPU::V_OR_B32_e32:
1115 case AMDGPU::S_OR_B32:
1116 Result = LHS | RHS;
1117 return true;
1118 case AMDGPU::V_XOR_B32_e64:
1119 case AMDGPU::V_XOR_B32_e32:
1120 case AMDGPU::S_XOR_B32:
1121 Result = LHS ^ RHS;
1122 return true;
1123 case AMDGPU::S_XNOR_B32:
1124 Result = ~(LHS ^ RHS);
1125 return true;
1126 case AMDGPU::S_NAND_B32:
1127 Result = ~(LHS & RHS);
1128 return true;
1129 case AMDGPU::S_NOR_B32:
1130 Result = ~(LHS | RHS);
1131 return true;
1132 case AMDGPU::S_ANDN2_B32:
1133 Result = LHS & ~RHS;
1134 return true;
1135 case AMDGPU::S_ORN2_B32:
1136 Result = LHS | ~RHS;
1137 return true;
1138 case AMDGPU::V_LSHL_B32_e64:
1139 case AMDGPU::V_LSHL_B32_e32:
1140 case AMDGPU::S_LSHL_B32:
1141 // The instruction ignores the high bits for out of bounds shifts.
1142 Result = LHS << (RHS & 31);
1143 return true;
1144 case AMDGPU::V_LSHLREV_B32_e64:
1145 case AMDGPU::V_LSHLREV_B32_e32:
1146 Result = RHS << (LHS & 31);
1147 return true;
1148 case AMDGPU::V_LSHR_B32_e64:
1149 case AMDGPU::V_LSHR_B32_e32:
1150 case AMDGPU::S_LSHR_B32:
1151 Result = LHS >> (RHS & 31);
1152 return true;
1153 case AMDGPU::V_LSHRREV_B32_e64:
1154 case AMDGPU::V_LSHRREV_B32_e32:
1155 Result = RHS >> (LHS & 31);
1156 return true;
1157 case AMDGPU::V_ASHR_I32_e64:
1158 case AMDGPU::V_ASHR_I32_e32:
1159 case AMDGPU::S_ASHR_I32:
1160 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1161 return true;
1162 case AMDGPU::V_ASHRREV_I32_e64:
1163 case AMDGPU::V_ASHRREV_I32_e32:
1164 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1165 return true;
1166 default:
1167 return false;
1168 }
1169}
1170
1171static unsigned getMovOpc(bool IsScalar) {
1172 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1173}
1174
1175static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1176 MI.setDesc(NewDesc);
1177
1178 // Remove any leftover implicit operands from mutating the instruction. e.g.
1179 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1180 // anymore.
1181 const MCInstrDesc &Desc = MI.getDesc();
1182 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1183 Desc.implicit_defs().size();
1184
1185 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1186 MI.removeOperand(I);
1187}
1188
1189MachineOperand *
1190SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1191 // If this has a subregister, it obviously is a register source.
1192 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1193 !Op.getReg().isVirtual())
1194 return &Op;
1195
1196 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1197 if (Def && Def->isMoveImmediate()) {
1198 MachineOperand &ImmSrc = Def->getOperand(1);
1199 if (ImmSrc.isImm())
1200 return &ImmSrc;
1201 }
1202
1203 return &Op;
1204}
1205
1206// Try to simplify operations with a constant that may appear after instruction
1207// selection.
1208// TODO: See if a frame index with a fixed offset can fold.
1209bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1210 if (!MI->allImplicitDefsAreDead())
1211 return false;
1212
1213 unsigned Opc = MI->getOpcode();
1214
1215 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1216 if (Src0Idx == -1)
1217 return false;
1218 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1219
1220 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1221 Opc == AMDGPU::S_NOT_B32) &&
1222 Src0->isImm()) {
1223 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1224 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1225 return true;
1226 }
1227
1228 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1229 if (Src1Idx == -1)
1230 return false;
1231 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1232
1233 if (!Src0->isImm() && !Src1->isImm())
1234 return false;
1235
1236 // and k0, k1 -> v_mov_b32 (k0 & k1)
1237 // or k0, k1 -> v_mov_b32 (k0 | k1)
1238 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1239 if (Src0->isImm() && Src1->isImm()) {
1240 int32_t NewImm;
1241 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1242 return false;
1243
1244 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1245
1246 // Be careful to change the right operand, src0 may belong to a different
1247 // instruction.
1248 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1249 MI->removeOperand(Src1Idx);
1250 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1251 return true;
1252 }
1253
1254 if (!MI->isCommutable())
1255 return false;
1256
1257 if (Src0->isImm() && !Src1->isImm()) {
1258 std::swap(Src0, Src1);
1259 std::swap(Src0Idx, Src1Idx);
1260 }
1261
1262 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1263 if (Opc == AMDGPU::V_OR_B32_e64 ||
1264 Opc == AMDGPU::V_OR_B32_e32 ||
1265 Opc == AMDGPU::S_OR_B32) {
1266 if (Src1Val == 0) {
1267 // y = or x, 0 => y = copy x
1268 MI->removeOperand(Src1Idx);
1269 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1270 } else if (Src1Val == -1) {
1271 // y = or x, -1 => y = v_mov_b32 -1
1272 MI->removeOperand(Src1Idx);
1273 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1274 } else
1275 return false;
1276
1277 return true;
1278 }
1279
1280 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1281 Opc == AMDGPU::S_AND_B32) {
1282 if (Src1Val == 0) {
1283 // y = and x, 0 => y = v_mov_b32 0
1284 MI->removeOperand(Src0Idx);
1285 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1286 } else if (Src1Val == -1) {
1287 // y = and x, -1 => y = copy x
1288 MI->removeOperand(Src1Idx);
1289 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1290 } else
1291 return false;
1292
1293 return true;
1294 }
1295
1296 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1297 Opc == AMDGPU::S_XOR_B32) {
1298 if (Src1Val == 0) {
1299 // y = xor x, 0 => y = copy x
1300 MI->removeOperand(Src1Idx);
1301 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1302 return true;
1303 }
1304 }
1305
1306 return false;
1307}
1308
1309// Try to fold an instruction into a simpler one
1310bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1311 unsigned Opc = MI.getOpcode();
1312 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1313 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1314 return false;
1315
1316 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1317 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1318 if (!Src1->isIdenticalTo(*Src0)) {
1319 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1320 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1321 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1322 return false;
1323 }
1324
1325 int Src1ModIdx =
1326 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1327 int Src0ModIdx =
1328 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1329 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1330 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1331 return false;
1332
1333 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1334 auto &NewDesc =
1335 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1336 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1337 if (Src2Idx != -1)
1338 MI.removeOperand(Src2Idx);
1339 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1340 if (Src1ModIdx != -1)
1341 MI.removeOperand(Src1ModIdx);
1342 if (Src0ModIdx != -1)
1343 MI.removeOperand(Src0ModIdx);
1344 mutateCopyOp(MI, NewDesc);
1345 LLVM_DEBUG(dbgs() << MI);
1346 return true;
1347}
1348
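// Fold away a v_and_b32 x, 0xffff when the instruction defining x is already
// known to zero the high 16 bits of its result.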
1349bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1350 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1351 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1352 return false;
1353
1354 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1355 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1356 return false;
1357
1358 Register Src1 = MI.getOperand(2).getReg();
1359 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1360 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1361 return false;
1362
1363 Register Dst = MI.getOperand(0).getReg();
1364 MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
1365 MI.eraseFromParent();
1366 return true;
1367}
1368
1369bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1370 MachineOperand &OpToFold) const {
1371 // We need mutate the operands of new mov instructions to add implicit
1372 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1373 // this.
1374 SmallVector<MachineInstr *, 4> CopiesToReplace;
1375 SmallVector<FoldCandidate, 4> FoldList;
1376 MachineOperand &Dst = MI.getOperand(0);
1377 bool Changed = false;
1378
1379 if (OpToFold.isImm()) {
1380 for (auto &UseMI :
1381 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1382 // Folding the immediate may reveal operations that can be constant
1383 // folded or replaced with a copy. This can happen for example after
1384 // frame indices are lowered to constants or from splitting 64-bit
1385 // constants.
1386 //
1387 // We may also encounter cases where one or both operands are
1388 // immediates materialized into a register, which would ordinarily not
1389 // be folded due to multiple uses or operand constraints.
1390 if (tryConstantFoldOp(&UseMI)) {
1391 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1392 Changed = true;
1393 }
1394 }
1395 }
1396
1397 SmallVector<MachineOperand *, 4> UsesToProcess;
1398 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1399 UsesToProcess.push_back(&Use);
1400 for (auto *U : UsesToProcess) {
1401 MachineInstr *UseMI = U->getParent();
1402 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1403 CopiesToReplace);
1404 }
1405
1406 if (CopiesToReplace.empty() && FoldList.empty())
1407 return Changed;
1408
1409 MachineFunction *MF = MI.getParent()->getParent();
1410 // Make sure we add EXEC uses to any new v_mov instructions created.
1411 for (MachineInstr *Copy : CopiesToReplace)
1412 Copy->addImplicitDefUseOperands(*MF);
1413
1414 for (FoldCandidate &Fold : FoldList) {
1415 assert(!Fold.isReg() || Fold.OpToFold);
1416 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1417 Register Reg = Fold.OpToFold->getReg();
1418 MachineInstr *DefMI = Fold.OpToFold->getParent();
1419 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1420 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1421 continue;
1422 }
1423 if (updateOperand(Fold)) {
1424 // Clear kill flags.
1425 if (Fold.isReg()) {
1426 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1427 // FIXME: Probably shouldn't bother trying to fold if not an
1428 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1429 // copies.
1430 MRI->clearKillFlags(Fold.OpToFold->getReg());
1431 }
1432 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1433 << static_cast<int>(Fold.UseOpNo) << " of "
1434 << *Fold.UseMI);
1435 } else if (Fold.Commuted) {
1436 // Restoring instruction's original operand order if fold has failed.
1437 TII->commuteInstruction(*Fold.UseMI, false);
1438 }
1439 }
1440 return true;
1441}
1442
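// Handle a foldable copy or materialized constant: fold its source into its
// users, erase it once it becomes dead, and track redundant rewrites of m0.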
1443bool SIFoldOperands::tryFoldFoldableCopy(
1444 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1445 // Specially track simple redefs of m0 to the same value in a block, so we
1446 // can erase the later ones.
1447 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1448 MachineOperand &NewM0Val = MI.getOperand(1);
1449 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1450 MI.eraseFromParent();
1451 return true;
1452 }
1453
1454 // We aren't tracking other physical registers
1455 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1456 ? nullptr
1457 : &NewM0Val;
1458 return false;
1459 }
1460
1461 MachineOperand &OpToFold = MI.getOperand(1);
1462 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1463
1464 // FIXME: We could also be folding things like TargetIndexes.
1465 if (!FoldingImm && !OpToFold.isReg())
1466 return false;
1467
1468 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1469 return false;
1470
1471 // Prevent folding operands backwards in the function. For example,
1472 // the COPY opcode must not be replaced by 1 in this example:
1473 //
1474 // %3 = COPY %vgpr0; VGPR_32:%3
1475 // ...
1476 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1477 if (!MI.getOperand(0).getReg().isVirtual())
1478 return false;
1479
1480 bool Changed = foldInstOperand(MI, OpToFold);
1481
1482 // If we managed to fold all uses of this copy then we might as well
1483 // delete it now.
1484 // The only reason we need to follow chains of copies here is that
1485 // tryFoldRegSequence looks forward through copies before folding a
1486 // REG_SEQUENCE into its eventual users.
1487 auto *InstToErase = &MI;
1488 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1489 auto &SrcOp = InstToErase->getOperand(1);
1490 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1491 InstToErase->eraseFromParent();
1492 Changed = true;
1493 InstToErase = nullptr;
1494 if (!SrcReg || SrcReg.isPhysical())
1495 break;
1496 InstToErase = MRI->getVRegDef(SrcReg);
1497 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1498 break;
1499 }
1500
1501 if (InstToErase && InstToErase->isRegSequence() &&
1502 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1503 InstToErase->eraseFromParent();
1504 Changed = true;
1505 }
1506
1507 return Changed;
1508}
1509
1510// Clamp patterns are canonically selected to v_max_* instructions, so only
1511// handle them.
1512const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1513 unsigned Op = MI.getOpcode();
1514 switch (Op) {
1515 case AMDGPU::V_MAX_F32_e64:
1516 case AMDGPU::V_MAX_F16_e64:
1517 case AMDGPU::V_MAX_F16_t16_e64:
1518 case AMDGPU::V_MAX_F16_fake16_e64:
1519 case AMDGPU::V_MAX_F64_e64:
1520 case AMDGPU::V_MAX_NUM_F64_e64:
1521 case AMDGPU::V_PK_MAX_F16: {
1522 if (MI.mayRaiseFPException())
1523 return nullptr;
1524
1525 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1526 return nullptr;
1527
1528 // Make sure sources are identical.
1529 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1530 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1531 if (!Src0->isReg() || !Src1->isReg() ||
1532 Src0->getReg() != Src1->getReg() ||
1533 Src0->getSubReg() != Src1->getSubReg() ||
1534 Src0->getSubReg() != AMDGPU::NoSubRegister)
1535 return nullptr;
1536
1537 // Can't fold up if we have modifiers.
1538 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1539 return nullptr;
1540
1541 unsigned Src0Mods
1542 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1543 unsigned Src1Mods
1544 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1545
1546 // Having a 0 op_sel_hi would require swizzling the output in the source
1547 // instruction, which we can't do.
1548 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1549 : 0u;
1550 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1551 return nullptr;
1552 return Src0;
1553 }
1554 default:
1555 return nullptr;
1556 }
1557}
1558
1559// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1560bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1561 const MachineOperand *ClampSrc = isClamp(MI);
1562 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1563 return false;
1564
1565 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1566
1567 // The type of clamp must be compatible.
1568 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1569 return false;
1570
1571 if (Def->mayRaiseFPException())
1572 return false;
1573
1574 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1575 if (!DefClamp)
1576 return false;
1577
1578 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1579
1580 // Clamp is applied after omod, so it is OK if omod is set.
1581 DefClamp->setImm(1);
1582 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1583 MI.eraseFromParent();
1584
1585 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1586 // instruction, so we might as well convert it to the more flexible VOP3-only
1587 // mad/fma form.
1588 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1589 Def->eraseFromParent();
1590
1591 return true;
1592}
1593
1594static int getOModValue(unsigned Opc, int64_t Val) {
1595 switch (Opc) {
1596 case AMDGPU::V_MUL_F64_e64:
1597 case AMDGPU::V_MUL_F64_pseudo_e64: {
1598 switch (Val) {
1599 case 0x3fe0000000000000: // 0.5
1600 return SIOutMods::DIV2;
1601 case 0x4000000000000000: // 2.0
1602 return SIOutMods::MUL2;
1603 case 0x4010000000000000: // 4.0
1604 return SIOutMods::MUL4;
1605 default:
1606 return SIOutMods::NONE;
1607 }
1608 }
1609 case AMDGPU::V_MUL_F32_e64: {
1610 switch (static_cast<uint32_t>(Val)) {
1611 case 0x3f000000: // 0.5
1612 return SIOutMods::DIV2;
1613 case 0x40000000: // 2.0
1614 return SIOutMods::MUL2;
1615 case 0x40800000: // 4.0
1616 return SIOutMods::MUL4;
1617 default:
1618 return SIOutMods::NONE;
1619 }
1620 }
1621 case AMDGPU::V_MUL_F16_e64:
1622 case AMDGPU::V_MUL_F16_t16_e64:
1623 case AMDGPU::V_MUL_F16_fake16_e64: {
1624 switch (static_cast<uint16_t>(Val)) {
1625 case 0x3800: // 0.5
1626 return SIOutMods::DIV2;
1627 case 0x4000: // 2.0
1628 return SIOutMods::MUL2;
1629 case 0x4400: // 4.0
1630 return SIOutMods::MUL4;
1631 default:
1632 return SIOutMods::NONE;
1633 }
1634 }
1635 default:
1636 llvm_unreachable("invalid mul opcode");
1637 }
1638}
1639
1640// FIXME: Does this really not support denormals with f16?
1641// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1642// handled, so will anything other than that break?
1643std::pair<const MachineOperand *, int>
1644SIFoldOperands::isOMod(const MachineInstr &MI) const {
1645 unsigned Op = MI.getOpcode();
1646 switch (Op) {
1647 case AMDGPU::V_MUL_F64_e64:
1648 case AMDGPU::V_MUL_F64_pseudo_e64:
1649 case AMDGPU::V_MUL_F32_e64:
1650 case AMDGPU::V_MUL_F16_t16_e64:
1651 case AMDGPU::V_MUL_F16_fake16_e64:
1652 case AMDGPU::V_MUL_F16_e64: {
1653 // If output denormals are enabled, omod is ignored.
1654 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1655 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1656 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1657 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1658 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1659 MFI->getMode().FP64FP16Denormals.Output !=
1660 DenormalMode::PreserveSign) ||
1661 MI.mayRaiseFPException())
1662 return std::pair(nullptr, SIOutMods::NONE);
1663
1664 const MachineOperand *RegOp = nullptr;
1665 const MachineOperand *ImmOp = nullptr;
1666 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1667 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1668 if (Src0->isImm()) {
1669 ImmOp = Src0;
1670 RegOp = Src1;
1671 } else if (Src1->isImm()) {
1672 ImmOp = Src1;
1673 RegOp = Src0;
1674 } else
1675 return std::pair(nullptr, SIOutMods::NONE);
1676
1677 int OMod = getOModValue(Op, ImmOp->getImm());
1678 if (OMod == SIOutMods::NONE ||
1679 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1680 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1681 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1682 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1683 return std::pair(nullptr, SIOutMods::NONE);
1684
1685 return std::pair(RegOp, OMod);
1686 }
1687 case AMDGPU::V_ADD_F64_e64:
1688 case AMDGPU::V_ADD_F64_pseudo_e64:
1689 case AMDGPU::V_ADD_F32_e64:
1690 case AMDGPU::V_ADD_F16_e64:
1691 case AMDGPU::V_ADD_F16_t16_e64:
1692 case AMDGPU::V_ADD_F16_fake16_e64: {
1693 // If output denormals are enabled, omod is ignored.
1694 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1695 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1696 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1697 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1698 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1699 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1700 return std::pair(nullptr, SIOutMods::NONE);
1701
1702 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1703 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1704 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1705
1706 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1707 Src0->getSubReg() == Src1->getSubReg() &&
1708 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1709 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1710 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1711 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1712 return std::pair(Src0, SIOutMods::MUL2);
1713
1714 return std::pair(nullptr, SIOutMods::NONE);
1715 }
1716 default:
1717 return std::pair(nullptr, SIOutMods::NONE);
1718 }
1719}
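// Schematic illustration of the two forms isOMod recognizes (not verbatim MIR;
// %a and %b are placeholder virtual registers, operand layout abbreviated):
//
//   %b = V_MUL_F32_e64 0, %a, 0, <imm 2.0>, clamp:0, omod:0
//     => returns (%a, SIOutMods::MUL2)
//
//   %b = V_ADD_F32_e64 0, %a, 0, %a, clamp:0, omod:0
//     => returns (%a, SIOutMods::MUL2)   // the fadd x, x form of x * 2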
1720
1721// FIXME: Does this need to check IEEE bit on function?
1722bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1723 const MachineOperand *RegOp;
1724 int OMod;
1725 std::tie(RegOp, OMod) = isOMod(MI);
1726 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1727 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1728 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1729 return false;
1730
1731 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1732 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1733 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1734 return false;
1735
1736 if (Def->mayRaiseFPException())
1737 return false;
1738
1739 // Clamp is applied after omod. If the source already has clamp set, don't
1740 // fold it.
1741 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1742 return false;
1743
1744 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1745
1746 DefOMod->setImm(OMod);
1747 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1748 MI.eraseFromParent();
1749
1750 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1751 // instruction, so we might as well convert it to the more flexible VOP3-only
1752 // mad/fma form.
1753 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1754 Def->eraseFromParent();
1755
1756 return true;
1757}
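// Schematic before/after for tryFoldOMod (placeholder virtual registers, not
// verbatim MIR, operand layout abbreviated):
//
//   %x = V_ADD_F32_e64 0, %a, 0, %b, clamp:0, omod:0
//   %y = V_MUL_F32_e64 0, %x, 0, <imm 2.0>, clamp:0, omod:0
// =>
//   %x = V_ADD_F32_e64 0, %a, 0, %b, clamp:0, omod:MUL2
//   (the multiply is erased and uses of %y are rewritten to use %x)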
1758
1759// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1760// instruction which can take an agpr. So far that means a store.
1761bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1762 assert(MI.isRegSequence());
1763 auto Reg = MI.getOperand(0).getReg();
1764
1765 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1766 !MRI->hasOneNonDBGUse(Reg))
1767 return false;
1768
1769  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1770  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1771 return false;
1772
1773 for (auto &Def : Defs) {
1774 const auto *Op = Def.first;
1775 if (!Op->isReg())
1776 return false;
1777 if (TRI->isAGPR(*MRI, Op->getReg()))
1778 continue;
1779 // Maybe this is a COPY from AREG
1780 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1781 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1782 return false;
1783 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1784 return false;
1785 }
1786
1787 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1788 MachineInstr *UseMI = Op->getParent();
1789 while (UseMI->isCopy() && !Op->getSubReg()) {
1790 Reg = UseMI->getOperand(0).getReg();
1791 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1792 return false;
1793 Op = &*MRI->use_nodbg_begin(Reg);
1794 UseMI = Op->getParent();
1795 }
1796
1797 if (Op->getSubReg())
1798 return false;
1799
1800 unsigned OpIdx = Op - &UseMI->getOperand(0);
1801 const MCInstrDesc &InstDesc = UseMI->getDesc();
1802 const TargetRegisterClass *OpRC =
1803 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1804 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1805 return false;
1806
1807 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1808 auto Dst = MRI->createVirtualRegister(NewDstRC);
1809 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1810 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1811
1812 for (unsigned I = 0; I < Defs.size(); ++I) {
1813 MachineOperand *Def = Defs[I].first;
1814 Def->setIsKill(false);
1815 if (TRI->isAGPR(*MRI, Def->getReg())) {
1816 RS.add(*Def);
1817 } else { // This is a copy
1818 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1819 SubDef->getOperand(1).setIsKill(false);
1820 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1821 }
1822 RS.addImm(Defs[I].second);
1823 }
1824
1825 Op->setReg(Dst);
1826 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1827 Op->setReg(Reg);
1828 RS->eraseFromParent();
1829 return false;
1830 }
1831
1832 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1833
1834 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1835 // in which case we can erase them all later in runOnMachineFunction.
1836 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1837 MI.eraseFromParent();
1838 return true;
1839}
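// Schematic before/after for tryFoldRegSequence (placeholder virtual
// registers, store opcode elided):
//
//   %v0:vgpr_32 = COPY %a0:agpr_32
//   %v1:vgpr_32 = COPY %a1:agpr_32
//   %vec:vreg_64 = REG_SEQUENCE %v0, %subreg.sub0, %v1, %subreg.sub1
//   (store using %vec)
// =>
//   %avec:areg_64 = REG_SEQUENCE %a0, %subreg.sub0, %a1, %subreg.sub1
//   (store using %avec, provided the store operand accepts an AV superclass)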
1840
1841/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
1842/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
1843static bool isAGPRCopy(const SIRegisterInfo &TRI,
1844 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1845 Register &OutReg, unsigned &OutSubReg) {
1846 assert(Copy.isCopy());
1847
1848 const MachineOperand &CopySrc = Copy.getOperand(1);
1849 Register CopySrcReg = CopySrc.getReg();
1850 if (!CopySrcReg.isVirtual())
1851 return false;
1852
1853 // Common case: copy from AGPR directly, e.g.
1854 // %1:vgpr_32 = COPY %0:agpr_32
1855 if (TRI.isAGPR(MRI, CopySrcReg)) {
1856 OutReg = CopySrcReg;
1857 OutSubReg = CopySrc.getSubReg();
1858 return true;
1859 }
1860
1861 // Sometimes it can also involve two copies, e.g.
1862 // %1:vgpr_256 = COPY %0:agpr_256
1863 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1864 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1865 if (!CopySrcDef || !CopySrcDef->isCopy())
1866 return false;
1867
1868 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1869 Register OtherCopySrcReg = OtherCopySrc.getReg();
1870 if (!OtherCopySrcReg.isVirtual() ||
1871 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1872 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1873 !TRI.isAGPR(MRI, OtherCopySrcReg))
1874 return false;
1875
1876 OutReg = OtherCopySrcReg;
1877 OutSubReg = CopySrc.getSubReg();
1878 return true;
1879}
1880
1881// Try to hoist an AGPR to VGPR copy across a PHI.
1882// This should allow folding of an AGPR into a consumer which may support it.
1883//
1884// Example 1: LCSSA PHI
1885// loop:
1886// %1:vreg = COPY %0:areg
1887// exit:
1888// %2:vreg = PHI %1:vreg, %loop
1889// =>
1890// loop:
1891// exit:
1892// %1:areg = PHI %0:areg, %loop
1893// %2:vreg = COPY %1:areg
1894//
1895// Example 2: PHI with multiple incoming values:
1896// entry:
1897// %1:vreg = GLOBAL_LOAD(..)
1898// loop:
1899// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1900// %3:areg = COPY %2:vreg
1901// %4:areg = (instr using %3:areg)
1902// %5:vreg = COPY %4:areg
1903// =>
1904// entry:
1905// %1:vreg = GLOBAL_LOAD(..)
1906// %2:areg = COPY %1:vreg
1907// loop:
1908 //      %3:areg = PHI %2:areg, %entry, %X:areg, %loop
1909// %4:areg = (instr using %3:areg)
1910bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1911 assert(PHI.isPHI());
1912
1913 Register PhiOut = PHI.getOperand(0).getReg();
1914 if (!TRI->isVGPR(*MRI, PhiOut))
1915 return false;
1916
1917 // Iterate once over all incoming values of the PHI to check if this PHI is
1918 // eligible, and determine the exact AGPR RC we'll target.
1919 const TargetRegisterClass *ARC = nullptr;
1920 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1921 MachineOperand &MO = PHI.getOperand(K);
1922 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1923 if (!Copy || !Copy->isCopy())
1924 continue;
1925
1926 Register AGPRSrc;
1927 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1928 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1929 continue;
1930
1931 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1932 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1933 CopyInRC = SubRC;
1934
1935 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1936 return false;
1937 ARC = CopyInRC;
1938 }
1939
1940 if (!ARC)
1941 return false;
1942
1943 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1944
1945 // Rewrite the PHI's incoming values to ARC.
1946 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1947 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1948 MachineOperand &MO = PHI.getOperand(K);
1949 Register Reg = MO.getReg();
1950
1951     MachineBasicBlock::iterator InsertPt;
1952     MachineBasicBlock *InsertMBB = nullptr;
1953
1954 // Look at the def of Reg, ignoring all copies.
1955 unsigned CopyOpc = AMDGPU::COPY;
1956 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1957
1958 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1959 // the copy was single-use, it will be removed by DCE later.
1960 if (Def->isCopy()) {
1961 Register AGPRSrc;
1962 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1963 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1964 MO.setReg(AGPRSrc);
1965 MO.setSubReg(AGPRSubReg);
1966 continue;
1967 }
1968
1969 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1970      // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1971 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
1972 // is unlikely to be profitable.
1973 //
1974 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1975 MachineOperand &CopyIn = Def->getOperand(1);
1976 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1977 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1978 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1979 }
1980
1981 InsertMBB = Def->getParent();
1982 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1983 } else {
1984 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1985 InsertPt = InsertMBB->getFirstTerminator();
1986 }
1987
1988 Register NewReg = MRI->createVirtualRegister(ARC);
1989 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
1990 TII->get(CopyOpc), NewReg)
1991 .addReg(Reg);
1992 MO.setReg(NewReg);
1993
1994 (void)MI;
1995 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
1996 }
1997
1998 // Replace the PHI's result with a new register.
1999 Register NewReg = MRI->createVirtualRegister(ARC);
2000 PHI.getOperand(0).setReg(NewReg);
2001
2002 // COPY that new register back to the original PhiOut register. This COPY will
2003 // usually be folded out later.
2004 MachineBasicBlock *MBB = PHI.getParent();
2005 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2006 TII->get(AMDGPU::COPY), PhiOut)
2007 .addReg(NewReg);
2008
2009 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2010 return true;
2011}
2012
2013// Attempt to convert a VGPR load to an AGPR load.
2014bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
2015 assert(MI.mayLoad());
2016 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2017 return false;
2018
2019 MachineOperand &Def = MI.getOperand(0);
2020 if (!Def.isDef())
2021 return false;
2022
2023 Register DefReg = Def.getReg();
2024
2025 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2026 return false;
2027
2028  SmallVector<const MachineInstr *, 8> Users;
2029  SmallVector<Register, 8> MoveRegs;
2030 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2031 Users.push_back(&I);
2032
2033 if (Users.empty())
2034 return false;
2035
2036  // Check that every use is a copy to an agpr or a reg_sequence producing an agpr.
2037 while (!Users.empty()) {
2038 const MachineInstr *I = Users.pop_back_val();
2039 if (!I->isCopy() && !I->isRegSequence())
2040 return false;
2041 Register DstReg = I->getOperand(0).getReg();
2042    // Physical registers may have more than one defining instruction.
2043 if (DstReg.isPhysical())
2044 return false;
2045 if (TRI->isAGPR(*MRI, DstReg))
2046 continue;
2047 MoveRegs.push_back(DstReg);
2048 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2049 Users.push_back(&U);
2050 }
2051
2052 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2053 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2054 if (!TII->isOperandLegal(MI, 0, &Def)) {
2055 MRI->setRegClass(DefReg, RC);
2056 return false;
2057 }
2058
2059 while (!MoveRegs.empty()) {
2060 Register Reg = MoveRegs.pop_back_val();
2061 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2062 }
2063
2064 LLVM_DEBUG(dbgs() << "Folded " << MI);
2065
2066 return true;
2067}
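// Schematic before/after for tryFoldLoad (placeholder virtual registers, load
// opcode elided):
//
//   %v:vgpr_32 = (some load)
//   %a:agpr_32 = COPY %v
// =>
//   %v:agpr_32 = (some load)   ; register class of the def switched to AGPR
//   %a:agpr_32 = COPY %v       ; now an AGPR-to-AGPR copy, expected to be
//                              ; coalesced away later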
2068
2069// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2070// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2071// there are cases where it can create a lot more AGPR-AGPR copies, which are
2072// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2073//
2074// This function looks at all AGPR PHIs in a basic block and collects their
2075// operands. Then, it checks for registers that are used more than once across
2076// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2077// having to create one VGPR temporary per use, which can get very messy if
2078// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2079// element).
2080//
2081// Example
2082// a:
2083// %in:agpr_256 = COPY %foo:vgpr_256
2084// c:
2085// %x:agpr_32 = ..
2086// b:
2087// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2088// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2089// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2090// =>
2091// a:
2092// %in:agpr_256 = COPY %foo:vgpr_256
2093// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2094// %tmp_agpr:agpr_32 = COPY %tmp
2095// c:
2096// %x:agpr_32 = ..
2097// b:
2098// %0:areg = PHI %tmp_agpr, %a, %x, %c
2099// %1:areg = PHI %tmp_agpr, %a, %y, %c
2100// %2:areg = PHI %tmp_agpr, %a, %z, %c
2101bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2102 // This is only really needed on GFX908 where AGPR-AGPR copies are
2103 // unreasonably difficult.
2104 if (ST->hasGFX90AInsts())
2105 return false;
2106
2107 // Look at all AGPR Phis and collect the register + subregister used.
2108 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2109 RegToMO;
2110
2111 for (auto &MI : MBB) {
2112 if (!MI.isPHI())
2113 break;
2114
2115 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2116 continue;
2117
2118 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2119 MachineOperand &PhiMO = MI.getOperand(K);
2120 if (!PhiMO.getSubReg())
2121 continue;
2122 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2123 }
2124 }
2125
2126  // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2127 // a VGPR.
2128 bool Changed = false;
2129 for (const auto &[Entry, MOs] : RegToMO) {
2130 if (MOs.size() == 1)
2131 continue;
2132
2133 const auto [Reg, SubReg] = Entry;
2134 MachineInstr *Def = MRI->getVRegDef(Reg);
2135 MachineBasicBlock *DefMBB = Def->getParent();
2136
2137 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2138 // out.
2139 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2140 Register TempVGPR =
2141 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2142 MachineInstr *VGPRCopy =
2143 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2144 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2145 .addReg(Reg, /* flags */ 0, SubReg);
2146
2147 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2148 Register TempAGPR = MRI->createVirtualRegister(ARC);
2149 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2150 TII->get(AMDGPU::COPY), TempAGPR)
2151 .addReg(TempVGPR);
2152
2153 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2154 for (MachineOperand *MO : MOs) {
2155 MO->setReg(TempAGPR);
2156 MO->setSubReg(AMDGPU::NoSubRegister);
2157 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2158 }
2159
2160 Changed = true;
2161 }
2162
2163 return Changed;
2164}
2165
2166bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2167 if (skipFunction(MF.getFunction()))
2168 return false;
2169
2170 MRI = &MF.getRegInfo();
2171 ST = &MF.getSubtarget<GCNSubtarget>();
2172 TII = ST->getInstrInfo();
2173 TRI = &TII->getRegisterInfo();
2174 MFI = MF.getInfo<SIMachineFunctionInfo>();
2175
2176 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2177 // correctly handle signed zeros.
2178 //
2179 // FIXME: Also need to check strictfp
2180 bool IsIEEEMode = MFI->getMode().IEEE;
2181 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2182
2183 bool Changed = false;
2184 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2185 MachineOperand *CurrentKnownM0Val = nullptr;
2186 for (auto &MI : make_early_inc_range(*MBB)) {
2187 Changed |= tryFoldCndMask(MI);
2188
2189 if (tryFoldZeroHighBits(MI)) {
2190 Changed = true;
2191 continue;
2192 }
2193
2194 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2195 Changed = true;
2196 continue;
2197 }
2198
2199 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2200 Changed = true;
2201 continue;
2202 }
2203
2204 if (MI.mayLoad() && tryFoldLoad(MI)) {
2205 Changed = true;
2206 continue;
2207 }
2208
2209 if (TII->isFoldableCopy(MI)) {
2210 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2211 continue;
2212 }
2213
2214 // Saw an unknown clobber of m0, so we no longer know what it is.
2215 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2216 CurrentKnownM0Val = nullptr;
2217
2218 // TODO: Omod might be OK if there is NSZ only on the source
2219 // instruction, and not the omod multiply.
2220 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2221 !tryFoldOMod(MI))
2222 Changed |= tryFoldClamp(MI);
2223 }
2224
2225 Changed |= tryOptimizeAGPRPhis(*MBB);
2226 }
2227
2228 return Changed;
2229}
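// Usage sketch: this pass is normally created via createSIFoldOperandsPass()
// or scheduled by its ID, e.g. (assuming a legacy pass manager configuration
// similar to AMDGPUTargetMachine):
//
//   addPass(&llvm::SIFoldOperandsID);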