SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
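// A FoldCandidate records one pending fold: the use instruction and operand
// index being folded into, what is folded (register, immediate, frame index
// or global address), whether the instruction had to be commuted to make the
// fold legal, and an optional 32-bit opcode to shrink to afterwards.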
24struct FoldCandidate {
25 MachineInstr *UseMI;
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
31 int ShrinkOpcode;
32 unsigned UseOpNo;
33 MachineOperand::MachineOperandType Kind;
34 bool Commuted;
35
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
69class SIFoldOperands : public MachineFunctionPass {
70public:
71 static char ID;
72 MachineRegisterInfo *MRI;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 bool updateOperand(FoldCandidate &Fold) const;
82
83 bool canUseImmWithOpSel(FoldCandidate &Fold) const;
84
85 bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
86
87 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
88 MachineInstr *MI, unsigned OpNo,
89 MachineOperand *OpToFold) const;
90 bool isUseSafeToFold(const MachineInstr &MI,
91 const MachineOperand &UseMO) const;
92 bool
93 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
94 Register UseReg, uint8_t OpTy) const;
95 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
96 unsigned UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList) const;
98 void foldOperand(MachineOperand &OpToFold,
99 MachineInstr *UseMI,
100 int UseOpIdx,
101 SmallVectorImpl<FoldCandidate> &FoldList,
102 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
103
104 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
105 bool tryConstantFoldOp(MachineInstr *MI) const;
106 bool tryFoldCndMask(MachineInstr &MI) const;
107 bool tryFoldZeroHighBits(MachineInstr &MI) const;
108 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
109 bool tryFoldFoldableCopy(MachineInstr &MI,
110 MachineOperand *&CurrentKnownM0Val) const;
111
112 const MachineOperand *isClamp(const MachineInstr &MI) const;
113 bool tryFoldClamp(MachineInstr &MI);
114
115 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
116 bool tryFoldOMod(MachineInstr &MI);
117 bool tryFoldRegSequence(MachineInstr &MI);
118 bool tryFoldPhiAGPR(MachineInstr &MI);
119 bool tryFoldLoad(MachineInstr &MI);
120
121 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
122
123public:
124 SIFoldOperands() : MachineFunctionPass(ID) {
125 initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
126 }
127
128 bool runOnMachineFunction(MachineFunction &MF) override;
129
130 StringRef getPassName() const override { return "SI Fold Operands"; }
131
132 void getAnalysisUsage(AnalysisUsage &AU) const override {
133 AU.setPreservesCFG();
134 MachineFunctionPass::getAnalysisUsage(AU);
135 }
136};
137
138} // End anonymous namespace.
139
140INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
141 "SI Fold Operands", false, false)
142
143char SIFoldOperands::ID = 0;
144
145char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
146
147static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
148 const TargetRegisterInfo &TRI,
149 const MachineOperand &MO) {
150 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
151 if (const TargetRegisterClass *SubRC =
152 TRI.getSubRegisterClass(RC, MO.getSubReg()))
153 RC = SubRC;
154 return RC;
155}
156
157// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
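// The MAC/FMAC forms have src2 tied to the destination, so an immediate
// cannot be placed there; the MAD/FMA forms take src2 as an independent
// operand and therefore accept the fold.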
158static unsigned macToMad(unsigned Opc) {
159 switch (Opc) {
160 case AMDGPU::V_MAC_F32_e64:
161 return AMDGPU::V_MAD_F32_e64;
162 case AMDGPU::V_MAC_F16_e64:
163 return AMDGPU::V_MAD_F16_e64;
164 case AMDGPU::V_FMAC_F32_e64:
165 return AMDGPU::V_FMA_F32_e64;
166 case AMDGPU::V_FMAC_F16_e64:
167 return AMDGPU::V_FMA_F16_gfx9_e64;
168 case AMDGPU::V_FMAC_F16_t16_e64:
169 return AMDGPU::V_FMA_F16_gfx9_e64;
170 case AMDGPU::V_FMAC_LEGACY_F32_e64:
171 return AMDGPU::V_FMA_LEGACY_F32_e64;
172 case AMDGPU::V_FMAC_F64_e64:
173 return AMDGPU::V_FMA_F64_e64;
174 }
175 return AMDGPU::INSTRUCTION_LIST_END;
176}
177
178// TODO: Add heuristic that the frame index might not fit in the addressing mode
179// immediate offset to avoid materializing in loops.
180bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
181 const MachineOperand &OpToFold) const {
182 if (!OpToFold.isFI())
183 return false;
184
185 const unsigned Opc = UseMI.getOpcode();
186 if (TII->isMUBUF(UseMI))
187 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
188 if (!TII->isFLATScratch(UseMI))
189 return false;
190
191 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
192 if (OpNo == SIdx)
193 return true;
194
195 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
196 return OpNo == VIdx && SIdx == -1;
197}
198
199FunctionPass *llvm::createSIFoldOperandsPass() {
200 return new SIFoldOperands();
201}
202
203bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
204 MachineInstr *MI = Fold.UseMI;
205 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
206 const uint64_t TSFlags = MI->getDesc().TSFlags;
207
208 assert(Old.isReg() && Fold.isImm());
209
210 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
211 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
212 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
213 return false;
214
215 unsigned Opcode = MI->getOpcode();
216 int OpNo = MI->getOperandNo(&Old);
217 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
218 switch (OpType) {
219 default:
220 return false;
221 case AMDGPU::OPERAND_REG_IMM_V2FP16:
222 case AMDGPU::OPERAND_REG_IMM_V2BF16:
223 case AMDGPU::OPERAND_REG_IMM_V2INT16:
224 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
225 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
226 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
227 break;
228 }
229
230 return true;
231}
232
233bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
234 MachineInstr *MI = Fold.UseMI;
235 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
236 unsigned Opcode = MI->getOpcode();
237 int OpNo = MI->getOperandNo(&Old);
238 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
239
240 // If the literal can be inlined as-is, apply it and short-circuit the
241 // tests below. The main motivation for this is to avoid unintuitive
242 // uses of opsel.
243 if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
244 Old.ChangeToImmediate(Fold.ImmToFold);
245 return true;
246 }
247
248 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
249 // op_sel in a way that allows an inline constant.
250 int ModIdx = -1;
251 unsigned SrcIdx = ~0;
252 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
253 ModIdx = AMDGPU::OpName::src0_modifiers;
254 SrcIdx = 0;
255 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
256 ModIdx = AMDGPU::OpName::src1_modifiers;
257 SrcIdx = 1;
258 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
259 ModIdx = AMDGPU::OpName::src2_modifiers;
260 SrcIdx = 2;
261 }
262 assert(ModIdx != -1);
263 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
264 MachineOperand &Mod = MI->getOperand(ModIdx);
265 unsigned ModVal = Mod.getImm();
266
267 uint16_t ImmLo = static_cast<uint16_t>(
268 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
269 uint16_t ImmHi = static_cast<uint16_t>(
270 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
271 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
272 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
273
274 // Helper function that attempts to inline the given value with a newly
275 // chosen opsel pattern.
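// If the packed literal itself is not inlinable, the halves can sometimes be
// rearranged: read one half for both lanes when they are equal, sign-extend
// a negative low half, move the low half into the high lane (integer ops
// only), or swap the halves, compensating with op_sel/op_sel_hi each time.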
276 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
277 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
278 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
279 Old.ChangeToImmediate(Imm);
280 return true;
281 }
282
283 // Try to shuffle the halves around and leverage opsel to get an inline
284 // constant.
285 uint16_t Lo = static_cast<uint16_t>(Imm);
286 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
287 if (Lo == Hi) {
288 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
289 Mod.setImm(NewModVal);
290 Old.ChangeToImmediate(Lo);
291 return true;
292 }
293
294 if (static_cast<int16_t>(Lo) < 0) {
295 int32_t SExt = static_cast<int16_t>(Lo);
296 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
297 Mod.setImm(NewModVal);
298 Old.ChangeToImmediate(SExt);
299 return true;
300 }
301 }
302
303 // This check is only useful for integer instructions
304 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
305 OpType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16) {
306 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
307 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
308 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
309 return true;
310 }
311 }
312 } else {
313 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
314 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
315 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
316 Old.ChangeToImmediate(Swapped);
317 return true;
318 }
319 }
320
321 return false;
322 };
323
324 if (tryFoldToInline(Imm))
325 return true;
326
327 // Replace integer addition by subtraction and vice versa if it allows
328 // folding the immediate to an inline constant.
329 //
330 // We should only ever get here for SrcIdx == 1 due to canonicalization
331 // earlier in the pipeline, but we double-check here to be safe / fully
332 // general.
333 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
334 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
335 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
336 unsigned ClampIdx =
337 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
338 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
339
340 if (!Clamp) {
341 uint16_t NegLo = -static_cast<uint16_t>(Imm);
342 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
343 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
344
345 if (tryFoldToInline(NegImm)) {
346 unsigned NegOpcode =
347 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
348 MI->setDesc(TII->get(NegOpcode));
349 return true;
350 }
351 }
352 }
353
354 return false;
355}
356
357bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
358 MachineInstr *MI = Fold.UseMI;
359 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
360 assert(Old.isReg());
361
362 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
363 if (tryFoldImmWithOpSel(Fold))
364 return true;
365
366 // We can't represent the candidate as an inline constant. Try as a literal
367 // with the original opsel, checking constant bus limitations.
368 MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
369 int OpNo = MI->getOperandNo(&Old);
370 if (!TII->isOperandLegal(*MI, OpNo, &New))
371 return false;
372 Old.ChangeToImmediate(Fold.ImmToFold);
373 return true;
374 }
375
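// Shrinking path: the fold is only legal in the 32-bit encoding, so rebuild
// the instruction as its VOP2 form, preserve the carry output through VCC if
// it has non-debug uses, and turn the original instruction into an
// IMPLICIT_DEF instead of erasing it so iterators stay valid.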
376 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
377 MachineBasicBlock *MBB = MI->getParent();
378 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
379 if (Liveness != MachineBasicBlock::LQR_Dead) {
380 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
381 return false;
382 }
383
384 int Op32 = Fold.ShrinkOpcode;
385 MachineOperand &Dst0 = MI->getOperand(0);
386 MachineOperand &Dst1 = MI->getOperand(1);
387 assert(Dst0.isDef() && Dst1.isDef());
388
389 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
390
391 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
392 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
393
394 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
395
396 if (HaveNonDbgCarryUse) {
397 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
398 Dst1.getReg())
399 .addReg(AMDGPU::VCC, RegState::Kill);
400 }
401
402 // Keep the old instruction around to avoid breaking iterators, but
403 // replace it with a dummy instruction to remove uses.
404 //
405 // FIXME: We should not invert how this pass looks at operands to avoid
406 // this. Should track set of foldable movs instead of looking for uses
407 // when looking at a use.
408 Dst0.setReg(NewReg0);
409 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
410 MI->removeOperand(I);
411 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
412
413 if (Fold.Commuted)
414 TII->commuteInstruction(*Inst32, false);
415 return true;
416 }
417
418 assert(!Fold.needsShrink() && "not handled");
419
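// Folding an immediate into a tied operand only succeeds for MFMAs whose
// src2 is tied to the destination; switch to the untied early-clobber MFMA
// variant first so src2 can become an immediate.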
420 if (Fold.isImm()) {
421 if (Old.isTied()) {
422 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
423 if (NewMFMAOpc == -1)
424 return false;
425 MI->setDesc(TII->get(NewMFMAOpc));
426 MI->untieRegOperand(0);
427 }
428 Old.ChangeToImmediate(Fold.ImmToFold);
429 return true;
430 }
431
432 if (Fold.isGlobal()) {
433 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
434 Fold.OpToFold->getTargetFlags());
435 return true;
436 }
437
438 if (Fold.isFI()) {
439 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
440 return true;
441 }
442
443 MachineOperand *New = Fold.OpToFold;
444 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
445 Old.setIsUndef(New->isUndef());
446 return true;
447}
448
449static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
450 const MachineInstr *MI) {
451 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
452}
453
454static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
455 MachineInstr *MI, unsigned OpNo,
456 MachineOperand *FoldOp, bool Commuted = false,
457 int ShrinkOp = -1) {
458 // Skip additional folding on the same operand.
459 for (FoldCandidate &Fold : FoldList)
460 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
461 return;
462 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
463 << " operand " << OpNo << "\n " << *MI);
464 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
465}
466
467bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
468 MachineInstr *MI, unsigned OpNo,
469 MachineOperand *OpToFold) const {
470 const unsigned Opc = MI->getOpcode();
471
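// S_FMAC_F32 has src2 tied to the destination. Rewriting it as S_FMAAK_F32
// (literal addend) or S_FMAMK_F32 (literal multiplicand) unties src2 and
// provides a literal slot the constant can be folded into.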
472 auto tryToFoldAsFMAAKorMK = [&]() {
473 if (!OpToFold->isImm())
474 return false;
475
476 const bool TryAK = OpNo == 3;
477 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
478 MI->setDesc(TII->get(NewOpc));
479
480 // We have to fold into the operand which will become the Imm, not into OpNo.
481 bool FoldAsFMAAKorMK =
482 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
483 if (FoldAsFMAAKorMK) {
484 // Untie Src2 of fmac.
485 MI->untieRegOperand(3);
486 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
487 if (OpNo == 1) {
488 MachineOperand &Op1 = MI->getOperand(1);
489 MachineOperand &Op2 = MI->getOperand(2);
490 Register OldReg = Op1.getReg();
491 // Operand 2 might be an inlinable constant
492 if (Op2.isImm()) {
493 Op1.ChangeToImmediate(Op2.getImm());
494 Op2.ChangeToRegister(OldReg, false);
495 } else {
496 Op1.setReg(Op2.getReg());
497 Op2.setReg(OldReg);
498 }
499 }
500 return true;
501 }
502 MI->setDesc(TII->get(Opc));
503 return false;
504 };
505
506 bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
507 if (!IsLegal && OpToFold->isImm()) {
508 FoldCandidate Fold(MI, OpNo, OpToFold);
509 IsLegal = canUseImmWithOpSel(Fold);
510 }
511
512 if (!IsLegal) {
513 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
514 unsigned NewOpc = macToMad(Opc);
515 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
516 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
517 // to fold the operand.
518 MI->setDesc(TII->get(NewOpc));
519 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
520 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
521 if (AddOpSel)
522 MI->addOperand(MachineOperand::CreateImm(0));
523 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
524 if (FoldAsMAD) {
525 MI->untieRegOperand(OpNo);
526 return true;
527 }
528 if (AddOpSel)
529 MI->removeOperand(MI->getNumExplicitOperands() - 1);
530 MI->setDesc(TII->get(Opc));
531 }
532
533 // Special case for s_fmac_f32 if we are trying to fold into Src2.
534 // By transforming into fmaak we can untie Src2 and make folding legal.
535 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
536 if (tryToFoldAsFMAAKorMK())
537 return true;
538 }
539
540 // Special case for s_setreg_b32
541 if (OpToFold->isImm()) {
542 unsigned ImmOpc = 0;
543 if (Opc == AMDGPU::S_SETREG_B32)
544 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
545 else if (Opc == AMDGPU::S_SETREG_B32_mode)
546 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
547 if (ImmOpc) {
548 MI->setDesc(TII->get(ImmOpc));
549 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
550 return true;
551 }
552 }
553
554 // If we are already folding into another operand of MI, then
555 // we can't commute the instruction, otherwise we risk making the
556 // other fold illegal.
557 if (isUseMIInFoldList(FoldList, MI))
558 return false;
559
560 // Operand is not legal, so try to commute the instruction to
561 // see if this makes it possible to fold.
562 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
563 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
564 if (!CanCommute)
565 return false;
566
567 // One of the operands might be an Imm operand, and OpNo may refer to it after
568 // the call of commuteInstruction() below. Such situations are avoided
569 // here explicitly as OpNo must be a register operand to be a candidate
570 // for memory folding.
571 if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
572 return false;
573
574 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
575 return false;
576
577 int Op32 = -1;
578 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
579 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
580 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
581 (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
582 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
583 return false;
584 }
585
586 // Verify the other operand is a VGPR, otherwise we would violate the
587 // constant bus restriction.
588 MachineOperand &OtherOp = MI->getOperand(OpNo);
589 if (!OtherOp.isReg() ||
590 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
591 return false;
592
593 assert(MI->getOperand(1).isDef());
594
595 // Make sure to get the 32-bit version of the commuted opcode.
596 unsigned MaybeCommutedOpc = MI->getOpcode();
597 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
598 }
599
600 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
601 return true;
602 }
603
604 // An inlinable constant might have been folded into the Imm operand of fmaak
605 // or fmamk while we are trying to fold a non-inlinable constant.
606 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
607 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
608 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
609 MachineOperand &OpImm = MI->getOperand(ImmIdx);
610 if (!OpImm.isReg() &&
611 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
612 return tryToFoldAsFMAAKorMK();
613 }
614
615 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
616 // By changing into fmamk we can untie Src2.
617 // If folding for Src0 happens first and Src0 is identical to Src1, avoid
618 // transforming into fmamk, which requires commuting, as that would make the
619 // later fold into Src1 fail due to a stale OpNo.
620 if (Opc == AMDGPU::S_FMAC_F32 &&
621 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
622 if (tryToFoldAsFMAAKorMK())
623 return true;
624 }
625
626 // Check the case where we might introduce a second constant operand to a
627 // scalar instruction
628 if (TII->isSALU(MI->getOpcode())) {
629 const MCInstrDesc &InstDesc = MI->getDesc();
630 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
631
632 // Fine if the operand can be encoded as an inline constant
633 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
634 // Otherwise check for another constant
635 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
636 auto &Op = MI->getOperand(i);
637 if (OpNo != i && !Op.isReg() &&
638 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
639 return false;
640 }
641 }
642 }
643
644 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
645 return true;
646}
647
648bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
649 const MachineOperand &UseMO) const {
650 // Operands of SDWA instructions must be registers.
651 return !TII->isSDWA(MI);
652}
653
654// Find the def of UseReg, check if it is a reg_sequence, and find the
655// initializer for each subreg, tracing each one back to a foldable inline
656// immediate where possible. Returns true on success.
657bool SIFoldOperands::getRegSeqInit(
658 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
659 Register UseReg, uint8_t OpTy) const {
660 MachineInstr *Def = MRI->getVRegDef(UseReg);
661 if (!Def || !Def->isRegSequence())
662 return false;
663
664 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
665 MachineOperand *Sub = &Def->getOperand(I);
666 assert(Sub->isReg());
667
668 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
669 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
670 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
671 SubDef = MRI->getVRegDef(Sub->getReg())) {
672 MachineOperand *Op = &SubDef->getOperand(1);
673 if (Op->isImm()) {
674 if (TII->isInlineConstant(*Op, OpTy))
675 Sub = Op;
676 break;
677 }
678 if (!Op->isReg() || Op->getReg().isPhysical())
679 break;
680 Sub = Op;
681 }
682
683 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
684 }
685
686 return true;
687}
688
689bool SIFoldOperands::tryToFoldACImm(
690 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
691 SmallVectorImpl<FoldCandidate> &FoldList) const {
692 const MCInstrDesc &Desc = UseMI->getDesc();
693 if (UseOpIdx >= Desc.getNumOperands())
694 return false;
695
697 return false;
698
699 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
700 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
701 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
702 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
703 return true;
704 }
705
706 if (!OpToFold.isReg())
707 return false;
708
709 Register UseReg = OpToFold.getReg();
710 if (!UseReg.isVirtual())
711 return false;
712
713 if (isUseMIInFoldList(FoldList, UseMI))
714 return false;
715
716 // Maybe it is just a COPY of an immediate itself.
717 MachineInstr *Def = MRI->getVRegDef(UseReg);
718 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
719 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
720 MachineOperand &DefOp = Def->getOperand(1);
721 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
722 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
723 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
724 return true;
725 }
726 }
727
728 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
729 if (!getRegSeqInit(Defs, UseReg, OpTy))
730 return false;
731
732 int32_t Imm;
733 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
734 const MachineOperand *Op = Defs[I].first;
735 if (!Op->isImm())
736 return false;
737
738 auto SubImm = Op->getImm();
739 if (!I) {
740 Imm = SubImm;
741 if (!TII->isInlineConstant(*Op, OpTy) ||
742 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
743 return false;
744
745 continue;
746 }
747 if (Imm != SubImm)
748 return false; // Can only fold splat constants
749 }
750
751 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
752 return true;
753}
754
755void SIFoldOperands::foldOperand(
756 MachineOperand &OpToFold,
757 MachineInstr *UseMI,
758 int UseOpIdx,
759 SmallVectorImpl<FoldCandidate> &FoldList,
760 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
761 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
762
763 if (!isUseSafeToFold(*UseMI, *UseOp))
764 return;
765
766 // FIXME: Fold operands with subregs.
767 if (UseOp->isReg() && OpToFold.isReg() &&
768 (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
769 return;
770
771 // Special case for REG_SEQUENCE: We can't fold literals into
772 // REG_SEQUENCE instructions, so we have to fold them into the
773 // uses of REG_SEQUENCE.
774 if (UseMI->isRegSequence()) {
775 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
776 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
777
778 // Grab the use operands first
779 SmallVector<MachineOperand *, 4> UsesToProcess;
780 for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
781 UsesToProcess.push_back(&Use);
782 for (auto *RSUse : UsesToProcess) {
783 MachineInstr *RSUseMI = RSUse->getParent();
784
785 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
786 RSUseMI->getOperandNo(RSUse), FoldList))
787 continue;
788
789 if (RSUse->getSubReg() != RegSeqDstSubReg)
790 continue;
791
792 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
793 CopiesToReplace);
794 }
795 return;
796 }
797
798 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
799 return;
800
801 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
802 // Verify that this is a stack access.
803 // FIXME: Should probably use stack pseudos before frame lowering.
804
805 if (TII->isMUBUF(*UseMI)) {
806 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
807 MFI->getScratchRSrcReg())
808 return;
809
810 // Ensure this is either relative to the current frame or the current
811 // wave.
812 MachineOperand &SOff =
813 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
814 if (!SOff.isImm() || SOff.getImm() != 0)
815 return;
816 }
817
818 // A frame index will resolve to a positive constant, so it should always be
819 // safe to fold the addressing mode, even pre-GFX9.
820 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
821
822 const unsigned Opc = UseMI->getOpcode();
823 if (TII->isFLATScratch(*UseMI) &&
824 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
825 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
826 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
827 UseMI->setDesc(TII->get(NewOpc));
828 }
829
830 return;
831 }
832
833 bool FoldingImmLike =
834 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
835
836 if (FoldingImmLike && UseMI->isCopy()) {
837 Register DestReg = UseMI->getOperand(0).getReg();
838 Register SrcReg = UseMI->getOperand(1).getReg();
839 assert(SrcReg.isVirtual());
840
841 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
842
843 // Don't fold into a copy to a physical register with the same class. Doing
844 // so would interfere with the register coalescer's logic, which avoids
845 // redundant initializations.
846 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
847 return;
848
849 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
850 if (!DestReg.isPhysical()) {
851 if (DestRC == &AMDGPU::AGPR_32RegClass &&
852 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
853 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
854 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
855 CopiesToReplace.push_back(UseMI);
856 return;
857 }
858 }
859
860 // In order to fold immediates into copies, we need to change the
861 // copy to a MOV.
862
863 unsigned MovOp = TII->getMovOpcode(DestRC);
864 if (MovOp == AMDGPU::COPY)
865 return;
866
867 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
868 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
869 while (ImpOpI != ImpOpE) {
870 MachineInstr::mop_iterator Tmp = ImpOpI;
871 ImpOpI++;
872 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
873 }
874 UseMI->setDesc(TII->get(MovOp));
875
876 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
877 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
878 MachineOperand NewSrcOp(SrcOp);
879 MachineFunction *MF = UseMI->getParent()->getParent();
880 UseMI->removeOperand(1);
881 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
882 UseMI->addOperand(NewSrcOp); // src0
883 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
884 UseOpIdx = 2;
885 UseOp = &UseMI->getOperand(UseOpIdx);
886 }
887 CopiesToReplace.push_back(UseMI);
888 } else {
889 if (UseMI->isCopy() && OpToFold.isReg() &&
890 UseMI->getOperand(0).getReg().isVirtual() &&
891 !UseMI->getOperand(1).getSubReg()) {
892 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
893 unsigned Size = TII->getOpSize(*UseMI, 1);
894 Register UseReg = OpToFold.getReg();
895 UseMI->getOperand(1).setReg(UseReg);
896 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
897 UseMI->getOperand(1).setIsKill(false);
898 CopiesToReplace.push_back(UseMI);
899 OpToFold.setIsKill(false);
900
901 // Remove kill flags as kills may now be out of order with uses.
902 MRI->clearKillFlags(OpToFold.getReg());
903
904 // It is very tricky to store a value into an AGPR: v_accvgpr_write_b32 can
905 // only accept a VGPR or an inline immediate. Recreate a reg_sequence with
906 // its initializers right here, so we will rematerialize immediates and
907 // avoid copies via different reg classes.
908 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
909 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
910 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
911 const DebugLoc &DL = UseMI->getDebugLoc();
912 MachineBasicBlock &MBB = *UseMI->getParent();
913
914 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
915 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
916 UseMI->removeOperand(I);
917
918 MachineInstrBuilder B(*MBB.getParent(), UseMI);
919 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
920 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
921 for (unsigned I = 0; I < Size / 4; ++I) {
922 MachineOperand *Def = Defs[I].first;
923 TargetInstrInfo::RegSubRegPair CopyToVGPR;
924 if (Def->isImm() &&
925 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
926 int64_t Imm = Def->getImm();
927
928 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
929 BuildMI(MBB, UseMI, DL,
930 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
931 B.addReg(Tmp);
932 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
933 auto Src = getRegSubRegPair(*Def);
934 Def->setIsKill(false);
935 if (!SeenAGPRs.insert(Src)) {
936 // We cannot build a reg_sequence out of the same registers, they
937 // must be copied. Better to do it here before copyPhysReg() creates
938 // several reads to do the AGPR->VGPR->AGPR copy.
939 CopyToVGPR = Src;
940 } else {
941 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
942 Src.SubReg);
943 }
944 } else {
945 assert(Def->isReg());
946 Def->setIsKill(false);
947 auto Src = getRegSubRegPair(*Def);
948
949 // Direct copy from SGPR to AGPR is not possible. To avoid creation
950 // of exploded SGPR->VGPR->AGPR copies in copyPhysReg() later,
951 // create a copy here and track if we already have such a copy.
952 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
953 CopyToVGPR = Src;
954 } else {
955 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
956 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
957 B.addReg(Tmp);
958 }
959 }
960
961 if (CopyToVGPR.Reg) {
962 Register Vgpr;
963 if (VGPRCopies.count(CopyToVGPR)) {
964 Vgpr = VGPRCopies[CopyToVGPR];
965 } else {
966 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
967 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
968 VGPRCopies[CopyToVGPR] = Vgpr;
969 }
970 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
971 BuildMI(MBB, UseMI, DL,
972 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
973 B.addReg(Tmp);
974 }
975
976 B.addImm(Defs[I].second);
977 }
978 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
979 return;
980 }
981
982 if (Size != 4)
983 return;
984
985 Register Reg0 = UseMI->getOperand(0).getReg();
986 Register Reg1 = UseMI->getOperand(1).getReg();
987 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
988 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
989 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
990 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
991 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
992 TRI->isAGPR(*MRI, Reg1))
993 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
994 return;
995 }
996
997 unsigned UseOpc = UseMI->getOpcode();
998 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
999 (UseOpc == AMDGPU::V_READLANE_B32 &&
1000 (int)UseOpIdx ==
1001 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1002 // %vgpr = V_MOV_B32 imm
1003 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1004 // =>
1005 // %sgpr = S_MOV_B32 imm
1006 if (FoldingImmLike) {
1007 if (execMayBeModifiedBeforeUse(*MRI,
1008 UseMI->getOperand(UseOpIdx).getReg(),
1009 *OpToFold.getParent(),
1010 *UseMI))
1011 return;
1012
1013 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1014
1015 if (OpToFold.isImm())
1016 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1017 else
1018 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
1019 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1020 return;
1021 }
1022
1023 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1024 if (execMayBeModifiedBeforeUse(*MRI,
1025 UseMI->getOperand(UseOpIdx).getReg(),
1026 *OpToFold.getParent(),
1027 *UseMI))
1028 return;
1029
1030 // %vgpr = COPY %sgpr0
1031 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1032 // =>
1033 // %sgpr1 = COPY %sgpr0
1034 UseMI->setDesc(TII->get(AMDGPU::COPY));
1035 UseMI->getOperand(1).setReg(OpToFold.getReg());
1036 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1037 UseMI->getOperand(1).setIsKill(false);
1038 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1039 return;
1040 }
1041 }
1042
1043 const MCInstrDesc &UseDesc = UseMI->getDesc();
1044
1045 // Don't fold into target independent nodes. Target independent opcodes
1046 // don't have defined register classes.
1047 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1048 UseDesc.operands()[UseOpIdx].RegClass == -1)
1049 return;
1050 }
1051
1052 if (!FoldingImmLike) {
1053 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
1054 // Don't fold if OpToFold doesn't hold an aligned register.
1055 const TargetRegisterClass *RC =
1056 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
1057 assert(RC);
1058 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
1059 unsigned SubReg = OpToFold.getSubReg();
1060 if (const TargetRegisterClass *SubRC =
1061 TRI->getSubRegisterClass(RC, SubReg))
1062 RC = SubRC;
1063 }
1064
1065 if (!RC || !TRI->isProperlyAlignedRC(*RC))
1066 return;
1067 }
1068
1069 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1070
1071 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1072 // to enable more folding opportunities. The shrink operands pass
1073 // already does this.
1074 return;
1075 }
1076
1077
1078 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
1079 const TargetRegisterClass *FoldRC =
1080 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
1081
1082 // Split 64-bit constants into 32-bits for folding.
1083 if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
1084 Register UseReg = UseOp->getReg();
1085 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
1086 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
1087 return;
1088
1089 APInt Imm(64, OpToFold.getImm());
1090 if (UseOp->getSubReg() == AMDGPU::sub0) {
1091 Imm = Imm.getLoBits(32);
1092 } else {
1093 assert(UseOp->getSubReg() == AMDGPU::sub1);
1094 Imm = Imm.getHiBits(32);
1095 }
1096
1097 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
1098 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
1099 return;
1100 }
1101
1102 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1103}
1104
1105static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1106 uint32_t LHS, uint32_t RHS) {
1107 switch (Opcode) {
1108 case AMDGPU::V_AND_B32_e64:
1109 case AMDGPU::V_AND_B32_e32:
1110 case AMDGPU::S_AND_B32:
1111 Result = LHS & RHS;
1112 return true;
1113 case AMDGPU::V_OR_B32_e64:
1114 case AMDGPU::V_OR_B32_e32:
1115 case AMDGPU::S_OR_B32:
1116 Result = LHS | RHS;
1117 return true;
1118 case AMDGPU::V_XOR_B32_e64:
1119 case AMDGPU::V_XOR_B32_e32:
1120 case AMDGPU::S_XOR_B32:
1121 Result = LHS ^ RHS;
1122 return true;
1123 case AMDGPU::S_XNOR_B32:
1124 Result = ~(LHS ^ RHS);
1125 return true;
1126 case AMDGPU::S_NAND_B32:
1127 Result = ~(LHS & RHS);
1128 return true;
1129 case AMDGPU::S_NOR_B32:
1130 Result = ~(LHS | RHS);
1131 return true;
1132 case AMDGPU::S_ANDN2_B32:
1133 Result = LHS & ~RHS;
1134 return true;
1135 case AMDGPU::S_ORN2_B32:
1136 Result = LHS | ~RHS;
1137 return true;
1138 case AMDGPU::V_LSHL_B32_e64:
1139 case AMDGPU::V_LSHL_B32_e32:
1140 case AMDGPU::S_LSHL_B32:
1141 // The instruction ignores the high bits for out of bounds shifts.
1142 Result = LHS << (RHS & 31);
1143 return true;
1144 case AMDGPU::V_LSHLREV_B32_e64:
1145 case AMDGPU::V_LSHLREV_B32_e32:
1146 Result = RHS << (LHS & 31);
1147 return true;
1148 case AMDGPU::V_LSHR_B32_e64:
1149 case AMDGPU::V_LSHR_B32_e32:
1150 case AMDGPU::S_LSHR_B32:
1151 Result = LHS >> (RHS & 31);
1152 return true;
1153 case AMDGPU::V_LSHRREV_B32_e64:
1154 case AMDGPU::V_LSHRREV_B32_e32:
1155 Result = RHS >> (LHS & 31);
1156 return true;
1157 case AMDGPU::V_ASHR_I32_e64:
1158 case AMDGPU::V_ASHR_I32_e32:
1159 case AMDGPU::S_ASHR_I32:
1160 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1161 return true;
1162 case AMDGPU::V_ASHRREV_I32_e64:
1163 case AMDGPU::V_ASHRREV_I32_e32:
1164 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1165 return true;
1166 default:
1167 return false;
1168 }
1169}
1170
1171static unsigned getMovOpc(bool IsScalar) {
1172 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1173}
1174
1175static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1176 MI.setDesc(NewDesc);
1177
1178 // Remove any leftover implicit operands from mutating the instruction. e.g.
1179 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1180 // anymore.
1181 const MCInstrDesc &Desc = MI.getDesc();
1182 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1183 Desc.implicit_defs().size();
1184
1185 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1186 MI.removeOperand(I);
1187}
1188
1189MachineOperand *
1190SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1191 // If this has a subregister, it obviously is a register source.
1192 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1193 !Op.getReg().isVirtual())
1194 return &Op;
1195
1196 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1197 if (Def && Def->isMoveImmediate()) {
1198 MachineOperand &ImmSrc = Def->getOperand(1);
1199 if (ImmSrc.isImm())
1200 return &ImmSrc;
1201 }
1202
1203 return &Op;
1204}
1205
1206// Try to simplify operations with a constant that may appear after instruction
1207// selection.
1208// TODO: See if a frame index with a fixed offset can fold.
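// For example, with %a = V_MOV_B32 64 and %b = V_MOV_B32 15, a V_AND_B32 of
// %a and %b is rewritten here as V_MOV_B32 0 once both sources are seen to
// be materialized immediates.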
1209bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1210 if (!MI->allImplicitDefsAreDead())
1211 return false;
1212
1213 unsigned Opc = MI->getOpcode();
1214
1215 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1216 if (Src0Idx == -1)
1217 return false;
1218 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1219
1220 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1221 Opc == AMDGPU::S_NOT_B32) &&
1222 Src0->isImm()) {
1223 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1224 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1225 return true;
1226 }
1227
1228 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1229 if (Src1Idx == -1)
1230 return false;
1231 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1232
1233 if (!Src0->isImm() && !Src1->isImm())
1234 return false;
1235
1236 // and k0, k1 -> v_mov_b32 (k0 & k1)
1237 // or k0, k1 -> v_mov_b32 (k0 | k1)
1238 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1239 if (Src0->isImm() && Src1->isImm()) {
1240 int32_t NewImm;
1241 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1242 return false;
1243
1244 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1245
1246 // Be careful to change the right operand, src0 may belong to a different
1247 // instruction.
1248 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1249 MI->removeOperand(Src1Idx);
1250 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1251 return true;
1252 }
1253
1254 if (!MI->isCommutable())
1255 return false;
1256
1257 if (Src0->isImm() && !Src1->isImm()) {
1258 std::swap(Src0, Src1);
1259 std::swap(Src0Idx, Src1Idx);
1260 }
1261
1262 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1263 if (Opc == AMDGPU::V_OR_B32_e64 ||
1264 Opc == AMDGPU::V_OR_B32_e32 ||
1265 Opc == AMDGPU::S_OR_B32) {
1266 if (Src1Val == 0) {
1267 // y = or x, 0 => y = copy x
1268 MI->removeOperand(Src1Idx);
1269 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1270 } else if (Src1Val == -1) {
1271 // y = or x, -1 => y = v_mov_b32 -1
1272 MI->removeOperand(Src1Idx);
1273 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1274 } else
1275 return false;
1276
1277 return true;
1278 }
1279
1280 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1281 Opc == AMDGPU::S_AND_B32) {
1282 if (Src1Val == 0) {
1283 // y = and x, 0 => y = v_mov_b32 0
1284 MI->removeOperand(Src0Idx);
1285 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1286 } else if (Src1Val == -1) {
1287 // y = and x, -1 => y = copy x
1288 MI->removeOperand(Src1Idx);
1289 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1290 } else
1291 return false;
1292
1293 return true;
1294 }
1295
1296 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1297 Opc == AMDGPU::S_XOR_B32) {
1298 if (Src1Val == 0) {
1299 // y = xor x, 0 => y = copy x
1300 MI->removeOperand(Src1Idx);
1301 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1302 return true;
1303 }
1304 }
1305
1306 return false;
1307}
1308
1309// Try to fold an instruction into a simpler one
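// A v_cndmask with identical (possibly materialized-immediate) sources
// produces the same value for either condition, so it degenerates into a
// plain copy or move of src0.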
1310bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1311 unsigned Opc = MI.getOpcode();
1312 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1313 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1314 return false;
1315
1316 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1317 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1318 if (!Src1->isIdenticalTo(*Src0)) {
1319 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1320 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1321 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1322 return false;
1323 }
1324
1325 int Src1ModIdx =
1326 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1327 int Src0ModIdx =
1328 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1329 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1330 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1331 return false;
1332
1333 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1334 auto &NewDesc =
1335 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1336 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1337 if (Src2Idx != -1)
1338 MI.removeOperand(Src2Idx);
1339 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1340 if (Src1ModIdx != -1)
1341 MI.removeOperand(Src1ModIdx);
1342 if (Src0ModIdx != -1)
1343 MI.removeOperand(Src0ModIdx);
1344 mutateCopyOp(MI, NewDesc);
1345 LLVM_DEBUG(dbgs() << MI);
1346 return true;
1347}
1348
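// v_and_b32 x, 0xffff is redundant when the instruction defining x already
// zeroes the high 16 bits of its result; forward the defining instruction's
// result and drop the AND.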
1349bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1350 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1351 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1352 return false;
1353
1354 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1355 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1356 return false;
1357
1358 Register Src1 = MI.getOperand(2).getReg();
1359 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1360 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1361 return false;
1362
1363 Register Dst = MI.getOperand(0).getReg();
1364 MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
1365 MI.eraseFromParent();
1366 return true;
1367}
1368
1369bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1370 MachineOperand &OpToFold) const {
1371 // We need to mutate the operands of new mov instructions to add implicit
1372 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1373 // this.
1374 SmallVector<MachineInstr *, 4> CopiesToReplace;
1375 SmallVector<FoldCandidate, 4> FoldList;
1376 MachineOperand &Dst = MI.getOperand(0);
1377 bool Changed = false;
1378
1379 if (OpToFold.isImm()) {
1380 for (auto &UseMI :
1381 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1382 // Folding the immediate may reveal operations that can be constant
1383 // folded or replaced with a copy. This can happen for example after
1384 // frame indices are lowered to constants or from splitting 64-bit
1385 // constants.
1386 //
1387 // We may also encounter cases where one or both operands are
1388 // immediates materialized into a register, which would ordinarily not
1389 // be folded due to multiple uses or operand constraints.
1390 if (tryConstantFoldOp(&UseMI)) {
1391 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1392 Changed = true;
1393 }
1394 }
1395 }
1396
1397 SmallVector<MachineOperand *, 4> UsesToProcess;
1398 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1399 UsesToProcess.push_back(&Use);
1400 for (auto *U : UsesToProcess) {
1401 MachineInstr *UseMI = U->getParent();
1402 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1403 CopiesToReplace);
1404 }
1405
1406 if (CopiesToReplace.empty() && FoldList.empty())
1407 return Changed;
1408
1409 MachineFunction *MF = MI.getParent()->getParent();
1410 // Make sure we add EXEC uses to any new v_mov instructions created.
1411 for (MachineInstr *Copy : CopiesToReplace)
1412 Copy->addImplicitDefUseOperands(*MF);
1413
1414 for (FoldCandidate &Fold : FoldList) {
1415 assert(!Fold.isReg() || Fold.OpToFold);
1416 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1417 Register Reg = Fold.OpToFold->getReg();
1418 MachineInstr *DefMI = Fold.OpToFold->getParent();
1419 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1420 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1421 continue;
1422 }
1423 if (updateOperand(Fold)) {
1424 // Clear kill flags.
1425 if (Fold.isReg()) {
1426 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1427 // FIXME: Probably shouldn't bother trying to fold if not an
1428 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1429 // copies.
1430 MRI->clearKillFlags(Fold.OpToFold->getReg());
1431 }
1432 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1433 << static_cast<int>(Fold.UseOpNo) << " of "
1434 << *Fold.UseMI);
1435 } else if (Fold.Commuted) {
1436 // Restoring instruction's original operand order if fold has failed.
1437 TII->commuteInstruction(*Fold.UseMI, false);
1438 }
1439 }
1440 return true;
1441}
1442
1443bool SIFoldOperands::tryFoldFoldableCopy(
1444 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1445 // Specially track simple redefs of m0 to the same value in a block, so we
1446 // can erase the later ones.
1447 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1448 MachineOperand &NewM0Val = MI.getOperand(1);
1449 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1450 MI.eraseFromParent();
1451 return true;
1452 }
1453
1454 // We aren't tracking other physical registers
1455 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1456 ? nullptr
1457 : &NewM0Val;
1458 return false;
1459 }
1460
1461 MachineOperand &OpToFold = MI.getOperand(1);
1462 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1463
1464 // FIXME: We could also be folding things like TargetIndexes.
1465 if (!FoldingImm && !OpToFold.isReg())
1466 return false;
1467
1468 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1469 return false;
1470
1471 // Prevent folding operands backwards in the function. For example,
1472 // the COPY opcode must not be replaced by 1 in this example:
1473 //
1474 // %3 = COPY %vgpr0; VGPR_32:%3
1475 // ...
1476 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1477 if (!MI.getOperand(0).getReg().isVirtual())
1478 return false;
1479
1480 bool Changed = foldInstOperand(MI, OpToFold);
1481
1482 // If we managed to fold all uses of this copy then we might as well
1483 // delete it now.
1484 // The only reason we need to follow chains of copies here is that
1485 // tryFoldRegSequence looks forward through copies before folding a
1486 // REG_SEQUENCE into its eventual users.
1487 auto *InstToErase = &MI;
1488 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1489 auto &SrcOp = InstToErase->getOperand(1);
1490 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1491 InstToErase->eraseFromParent();
1492 Changed = true;
1493 InstToErase = nullptr;
1494 if (!SrcReg || SrcReg.isPhysical())
1495 break;
1496 InstToErase = MRI->getVRegDef(SrcReg);
1497 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1498 break;
1499 }
1500
1501 if (InstToErase && InstToErase->isRegSequence() &&
1502 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1503 InstToErase->eraseFromParent();
1504 Changed = true;
1505 }
1506
1507 return Changed;
1508}
1509
1510// Clamp patterns are canonically selected to v_max_* instructions, so only
1511// handle them.
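// The pattern is v_max_*(x, x) with the clamp bit set and no other
// modifiers; the fold moves the clamp bit onto the instruction defining x.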
1512const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1513 unsigned Op = MI.getOpcode();
1514 switch (Op) {
1515 case AMDGPU::V_MAX_F32_e64:
1516 case AMDGPU::V_MAX_F16_e64:
1517 case AMDGPU::V_MAX_F16_t16_e64:
1518 case AMDGPU::V_MAX_F16_fake16_e64:
1519 case AMDGPU::V_MAX_F64_e64:
1520 case AMDGPU::V_MAX_NUM_F64_e64:
1521 case AMDGPU::V_PK_MAX_F16: {
1522 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1523 return nullptr;
1524
1525 // Make sure sources are identical.
1526 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1527 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1528 if (!Src0->isReg() || !Src1->isReg() ||
1529 Src0->getReg() != Src1->getReg() ||
1530 Src0->getSubReg() != Src1->getSubReg() ||
1531 Src0->getSubReg() != AMDGPU::NoSubRegister)
1532 return nullptr;
1533
1534 // Can't fold up if we have modifiers.
1535 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1536 return nullptr;
1537
1538 unsigned Src0Mods
1539 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1540 unsigned Src1Mods
1541 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1542
1543 // Having a 0 op_sel_hi would require swizzling the output in the source
1544 // instruction, which we can't do.
1545 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1546 : 0u;
1547 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1548 return nullptr;
1549 return Src0;
1550 }
1551 default:
1552 return nullptr;
1553 }
1554}
1555
1556// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1557bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1558 const MachineOperand *ClampSrc = isClamp(MI);
1559 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1560 return false;
1561
1562 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1563
1564 // The type of clamp must be compatible.
1565 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1566 return false;
1567
1568 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1569 if (!DefClamp)
1570 return false;
1571
1572 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1573
1574 // Clamp is applied after omod, so it is OK if omod is set.
1575 DefClamp->setImm(1);
1576 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1577 MI.eraseFromParent();
1578
1579 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1580 // instruction, so we might as well convert it to the more flexible VOP3-only
1581 // mad/fma form.
1582 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1583 Def->eraseFromParent();
1584
1585 return true;
1586}
1587
1588static int getOModValue(unsigned Opc, int64_t Val) {
1589 switch (Opc) {
1590 case AMDGPU::V_MUL_F64_e64:
1591 case AMDGPU::V_MUL_F64_pseudo_e64: {
1592 switch (Val) {
1593 case 0x3fe0000000000000: // 0.5
1594 return SIOutMods::DIV2;
1595 case 0x4000000000000000: // 2.0
1596 return SIOutMods::MUL2;
1597 case 0x4010000000000000: // 4.0
1598 return SIOutMods::MUL4;
1599 default:
1600 return SIOutMods::NONE;
1601 }
1602 }
1603 case AMDGPU::V_MUL_F32_e64: {
1604 switch (static_cast<uint32_t>(Val)) {
1605 case 0x3f000000: // 0.5
1606 return SIOutMods::DIV2;
1607 case 0x40000000: // 2.0
1608 return SIOutMods::MUL2;
1609 case 0x40800000: // 4.0
1610 return SIOutMods::MUL4;
1611 default:
1612 return SIOutMods::NONE;
1613 }
1614 }
1615 case AMDGPU::V_MUL_F16_e64:
1616 case AMDGPU::V_MUL_F16_t16_e64:
1617 case AMDGPU::V_MUL_F16_fake16_e64: {
1618 switch (static_cast<uint16_t>(Val)) {
1619 case 0x3800: // 0.5
1620 return SIOutMods::DIV2;
1621 case 0x4000: // 2.0
1622 return SIOutMods::MUL2;
1623 case 0x4400: // 4.0
1624 return SIOutMods::MUL4;
1625 default:
1626 return SIOutMods::NONE;
1627 }
1628 }
1629 default:
1630 llvm_unreachable("invalid mul opcode");
1631 }
1632}
1633
1634// FIXME: Does this really not support denormals with f16?
1635// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1636// handled, so will anything other than that break?
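// An output modifier encodes a final multiply by 0.5, 2.0 or 4.0. A multiply
// by one of those constants (or an add of a value to itself, which is a
// multiply by 2.0) can therefore be absorbed into the omod field of the
// defining instruction.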
1637std::pair<const MachineOperand *, int>
1638SIFoldOperands::isOMod(const MachineInstr &MI) const {
1639 unsigned Op = MI.getOpcode();
1640 switch (Op) {
1641 case AMDGPU::V_MUL_F64_e64:
1642 case AMDGPU::V_MUL_F64_pseudo_e64:
1643 case AMDGPU::V_MUL_F32_e64:
1644 case AMDGPU::V_MUL_F16_t16_e64:
1645 case AMDGPU::V_MUL_F16_fake16_e64:
1646 case AMDGPU::V_MUL_F16_e64: {
1647 // If output denormals are enabled, omod is ignored.
1648 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1649 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1650 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1651 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1652 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1653 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1654 return std::pair(nullptr, SIOutMods::NONE);
1655
1656 const MachineOperand *RegOp = nullptr;
1657 const MachineOperand *ImmOp = nullptr;
1658 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1659 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1660 if (Src0->isImm()) {
1661 ImmOp = Src0;
1662 RegOp = Src1;
1663 } else if (Src1->isImm()) {
1664 ImmOp = Src1;
1665 RegOp = Src0;
1666 } else
1667 return std::pair(nullptr, SIOutMods::NONE);
1668
1669 int OMod = getOModValue(Op, ImmOp->getImm());
1670 if (OMod == SIOutMods::NONE ||
1671 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1672 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1673 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1674 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1675 return std::pair(nullptr, SIOutMods::NONE);
1676
1677 return std::pair(RegOp, OMod);
1678 }
1679 case AMDGPU::V_ADD_F64_e64:
1680 case AMDGPU::V_ADD_F64_pseudo_e64:
1681 case AMDGPU::V_ADD_F32_e64:
1682 case AMDGPU::V_ADD_F16_e64:
1683 case AMDGPU::V_ADD_F16_t16_e64:
1684 case AMDGPU::V_ADD_F16_fake16_e64: {
1685 // If output denormals are enabled, omod is ignored.
1686 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1687 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1688 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1689 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1690 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1691 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1692 return std::pair(nullptr, SIOutMods::NONE);
1693
1694 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1695 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1696 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1697
1698 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1699 Src0->getSubReg() == Src1->getSubReg() &&
1700 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1701 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1702 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1703 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1704 return std::pair(Src0, SIOutMods::MUL2);
1705
1706 return std::pair(nullptr, SIOutMods::NONE);
1707 }
1708 default:
1709 return std::pair(nullptr, SIOutMods::NONE);
1710 }
1711}
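// Editorial note: the V_ADD_* cases above exist because the DAG combiner
// canonicalizes "fmul x, 2.0" into "fadd x, x", so an add whose two sources
// are the same register and subregister, with no source modifiers, clamp or
// omod set, is treated like a multiply by 2.0 and reported as
// {src0, SIOutMods::MUL2}.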
1712
1713// FIXME: Does this need to check IEEE bit on function?
1714bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1715 const MachineOperand *RegOp;
1716 int OMod;
1717 std::tie(RegOp, OMod) = isOMod(MI);
1718 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1719 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1720 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1721 return false;
1722
1723 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1724 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1725 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1726 return false;
1727
1728 // Clamp is applied after omod. If the source already has clamp set, don't
1729 // fold it.
1730 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1731 return false;
1732
1733 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1734
1735 DefOMod->setImm(OMod);
1736 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1737 MI.eraseFromParent();
1738
1739 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1740 // instruction, so we might as well convert it to the more flexible VOP3-only
1741 // mad/fma form.
1742 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1743 Def->eraseFromParent();
1744
1745 return true;
1746}
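// Illustrative before/after for tryFoldOMod (editorial sketch; operands are
// schematic and the vreg names are made up):
//
//   %0:vgpr_32 = V_ADD_F32_e64 ..., %a, ..., %b, clamp(0), omod(NONE)
//   %1:vgpr_32 = V_MUL_F32_e64 ..., 2.0, ..., %0, clamp(0), omod(NONE)
//     =>
//   %0:vgpr_32 = V_ADD_F32_e64 ..., %a, ..., %b, clamp(0), omod(MUL2)
//
// The multiply is erased and every use of %1 is rewritten to use %0 (this
// requires %0 to have no other users, no subregister use, and no clamp
// already set). If %0 was produced by a VOP2 mac/fmac, it is also converted
// to its three-address mad/fma form, since the omod operand forces the VOP3
// encoding anyway.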
1747
1748// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1749// instruction which can take an agpr. So far that means a store.
1750bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1751 assert(MI.isRegSequence());
1752 auto Reg = MI.getOperand(0).getReg();
1753
1754 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1755 !MRI->hasOneNonDBGUse(Reg))
1756 return false;
1757
1758 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1759 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1760 return false;
1761
1762 for (auto &Def : Defs) {
1763 const auto *Op = Def.first;
1764 if (!Op->isReg())
1765 return false;
1766 if (TRI->isAGPR(*MRI, Op->getReg()))
1767 continue;
1768 // Maybe this is a COPY from AREG
1769 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1770 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1771 return false;
1772 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1773 return false;
1774 }
1775
1776 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1777 MachineInstr *UseMI = Op->getParent();
1778 while (UseMI->isCopy() && !Op->getSubReg()) {
1779 Reg = UseMI->getOperand(0).getReg();
1780 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1781 return false;
1782 Op = &*MRI->use_nodbg_begin(Reg);
1783 UseMI = Op->getParent();
1784 }
1785
1786 if (Op->getSubReg())
1787 return false;
1788
1789 unsigned OpIdx = Op - &UseMI->getOperand(0);
1790 const MCInstrDesc &InstDesc = UseMI->getDesc();
1791 const TargetRegisterClass *OpRC =
1792 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1793 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1794 return false;
1795
1796 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1797 auto Dst = MRI->createVirtualRegister(NewDstRC);
1798 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1799 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1800
1801 for (unsigned I = 0; I < Defs.size(); ++I) {
1802 MachineOperand *Def = Defs[I].first;
1803 Def->setIsKill(false);
1804 if (TRI->isAGPR(*MRI, Def->getReg())) {
1805 RS.add(*Def);
1806 } else { // This is a copy
1807 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1808 SubDef->getOperand(1).setIsKill(false);
1809 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1810 }
1811 RS.addImm(Defs[I].second);
1812 }
1813
1814 Op->setReg(Dst);
1815 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1816 Op->setReg(Reg);
1817 RS->eraseFromParent();
1818 return false;
1819 }
1820
1821 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1822
1823 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1824 // in which case we can erase them all later in runOnMachineFunction.
1825 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1826 MI.eraseFromParent();
1827 return true;
1828}
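// Illustrative before/after for tryFoldRegSequence (editorial sketch; the
// store opcode and vreg names are schematic):
//
//   %0:agpr_32 = ...
//   %1:agpr_32 = ...
//   %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 ..., %2, ...
//     =>
//   %3:areg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 ..., %3, ...
//
// The rewrite is only kept when the user's operand class is an AV "vector
// super class" (accepts either VGPRs or AGPRs), so the new AGPR operand stays
// legal; otherwise the temporary REG_SEQUENCE is erased again.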
1829
1830/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
1831/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
1832static bool isAGPRCopy(const SIRegisterInfo &TRI,
1833 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1834 Register &OutReg, unsigned &OutSubReg) {
1835 assert(Copy.isCopy());
1836
1837 const MachineOperand &CopySrc = Copy.getOperand(1);
1838 Register CopySrcReg = CopySrc.getReg();
1839 if (!CopySrcReg.isVirtual())
1840 return false;
1841
1842 // Common case: copy from AGPR directly, e.g.
1843 // %1:vgpr_32 = COPY %0:agpr_32
1844 if (TRI.isAGPR(MRI, CopySrcReg)) {
1845 OutReg = CopySrcReg;
1846 OutSubReg = CopySrc.getSubReg();
1847 return true;
1848 }
1849
1850 // Sometimes it can also involve two copies, e.g.
1851 // %1:vgpr_256 = COPY %0:agpr_256
1852 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1853 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1854 if (!CopySrcDef || !CopySrcDef->isCopy())
1855 return false;
1856
1857 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1858 Register OtherCopySrcReg = OtherCopySrc.getReg();
1859 if (!OtherCopySrcReg.isVirtual() ||
1860 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1861 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1862 !TRI.isAGPR(MRI, OtherCopySrcReg))
1863 return false;
1864
1865 OutReg = OtherCopySrcReg;
1866 OutSubReg = CopySrc.getSubReg();
1867 return true;
1868}
1869
1870// Try to hoist an AGPR to VGPR copy across a PHI.
1871// This should allow folding of an AGPR into a consumer which may support it.
1872//
1873// Example 1: LCSSA PHI
1874// loop:
1875// %1:vreg = COPY %0:areg
1876// exit:
1877// %2:vreg = PHI %1:vreg, %loop
1878// =>
1879// loop:
1880// exit:
1881// %1:areg = PHI %0:areg, %loop
1882// %2:vreg = COPY %1:areg
1883//
1884// Example 2: PHI with multiple incoming values:
1885// entry:
1886// %1:vreg = GLOBAL_LOAD(..)
1887// loop:
1888// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1889// %3:areg = COPY %2:vreg
1890// %4:areg = (instr using %3:areg)
1891// %5:vreg = COPY %4:areg
1892// =>
1893// entry:
1894// %1:vreg = GLOBAL_LOAD(..)
1895// %2:areg = COPY %1:vreg
1896// loop:
1897// %3:areg = PHI %2:areg, %entry, %X:areg,
1898// %4:areg = (instr using %3:areg)
1899bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1900 assert(PHI.isPHI());
1901
1902 Register PhiOut = PHI.getOperand(0).getReg();
1903 if (!TRI->isVGPR(*MRI, PhiOut))
1904 return false;
1905
1906 // Iterate once over all incoming values of the PHI to check if this PHI is
1907 // eligible, and determine the exact AGPR RC we'll target.
1908 const TargetRegisterClass *ARC = nullptr;
1909 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1910 MachineOperand &MO = PHI.getOperand(K);
1911 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1912 if (!Copy || !Copy->isCopy())
1913 continue;
1914
1915 Register AGPRSrc;
1916 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1917 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1918 continue;
1919
1920 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1921 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1922 CopyInRC = SubRC;
1923
1924 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1925 return false;
1926 ARC = CopyInRC;
1927 }
1928
1929 if (!ARC)
1930 return false;
1931
1932 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1933
1934 // Rewrite the PHI's incoming values to ARC.
1935 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1936 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1937 MachineOperand &MO = PHI.getOperand(K);
1938 Register Reg = MO.getReg();
1939
1940 MachineBasicBlock::iterator InsertPt;
1941 MachineBasicBlock *InsertMBB = nullptr;
1942
1943 // Look at the def of Reg, ignoring all copies.
1944 unsigned CopyOpc = AMDGPU::COPY;
1945 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1946
1947 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1948 // the copy was single-use, it will be removed by DCE later.
1949 if (Def->isCopy()) {
1950 Register AGPRSrc;
1951 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1952 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1953 MO.setReg(AGPRSrc);
1954 MO.setSubReg(AGPRSubReg);
1955 continue;
1956 }
1957
1958 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1959 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1960 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
1961 // is unlikely to be profitable.
1962 //
1963 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1964 MachineOperand &CopyIn = Def->getOperand(1);
1965 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1966 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1967 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1968 }
1969
1970 InsertMBB = Def->getParent();
1971 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1972 } else {
1973 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1974 InsertPt = InsertMBB->getFirstTerminator();
1975 }
1976
1977 Register NewReg = MRI->createVirtualRegister(ARC);
1978 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
1979 TII->get(CopyOpc), NewReg)
1980 .addReg(Reg);
1981 MO.setReg(NewReg);
1982
1983 (void)MI;
1984 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
1985 }
1986
1987 // Replace the PHI's result with a new register.
1988 Register NewReg = MRI->createVirtualRegister(ARC);
1989 PHI.getOperand(0).setReg(NewReg);
1990
1991 // COPY that new register back to the original PhiOut register. This COPY will
1992 // usually be folded out later.
1993 MachineBasicBlock *MBB = PHI.getParent();
1994 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
1995 TII->get(AMDGPU::COPY), PhiOut)
1996 .addReg(NewReg);
1997
1998 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
1999 return true;
2000}
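// Editorial note: machine-level PHIs store their inputs as (value, block)
// pairs after the result operand, e.g.
//
//   %2:vgpr_32 = PHI %0:vgpr_32, %bb.1, %1:vgpr_32, %bb.2
//
// which is why tryFoldPhiAGPR above and tryOptimizeAGPRPhis below walk the
// operands starting at index 1 with a stride of 2: operand K is an incoming
// value and operand K + 1 is the predecessor block it arrives from.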
2001
2002// Attempt to convert VGPR load to an AGPR load.
2003bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
2004 assert(MI.mayLoad());
2005 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2006 return false;
2007
2008 MachineOperand &Def = MI.getOperand(0);
2009 if (!Def.isDef())
2010 return false;
2011
2012 Register DefReg = Def.getReg();
2013
2014 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2015 return false;
2016
2017 SmallVector<const MachineInstr *, 8> Users;
2018 SmallVector<Register, 8> MoveRegs;
2019 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2020 Users.push_back(&I);
2021
2022 if (Users.empty())
2023 return false;
2024
2025 // Check that all uses are a copy to an agpr or a reg_sequence producing an agpr.
2026 while (!Users.empty()) {
2027 const MachineInstr *I = Users.pop_back_val();
2028 if (!I->isCopy() && !I->isRegSequence())
2029 return false;
2030 Register DstReg = I->getOperand(0).getReg();
2031 // Physical registers may have more than one defining instruction.
2032 if (DstReg.isPhysical())
2033 return false;
2034 if (TRI->isAGPR(*MRI, DstReg))
2035 continue;
2036 MoveRegs.push_back(DstReg);
2037 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2038 Users.push_back(&U);
2039 }
2040
2041 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2042 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2043 if (!TII->isOperandLegal(MI, 0, &Def)) {
2044 MRI->setRegClass(DefReg, RC);
2045 return false;
2046 }
2047
2048 while (!MoveRegs.empty()) {
2049 Register Reg = MoveRegs.pop_back_val();
2050 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2051 }
2052
2053 LLVM_DEBUG(dbgs() << "Folded " << MI);
2054
2055 return true;
2056}
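// Illustrative before/after for tryFoldLoad (editorial sketch; the load
// opcode and vreg names are made up):
//
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD ...
//   %1:agpr_32 = COPY %0
//     =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD ...
//   %1:agpr_32 = COPY %0            ; now a trivial AGPR-to-AGPR copy
//
// No instructions are added or removed: the register classes of the load's
// result and of every copy/reg_sequence feeding the AGPR uses are retargeted
// to the equivalent AGPR classes (legal on gfx90a+, where loads can write
// AGPRs directly), and the change is undone if the load's def operand would
// become illegal.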
2057
2058// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2059// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2060// there are cases where it can create many more AGPR-AGPR copies, which are
2061// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2062//
2063// This function looks at all AGPR PHIs in a basic block and collects their
2064// operands. Then, it checks for registers that are used more than once across
2065// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2066// having to create one VGPR temporary per use, which can get very messy if
2067// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2068// element).
2069//
2070// Example
2071// a:
2072// %in:agpr_256 = COPY %foo:vgpr_256
2073// c:
2074// %x:agpr_32 = ..
2075// b:
2076// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2077// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2078// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2079// =>
2080// a:
2081// %in:agpr_256 = COPY %foo:vgpr_256
2082// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2083// %tmp_agpr:agpr_32 = COPY %tmp
2084// c:
2085// %x:agpr_32 = ..
2086// b:
2087// %0:areg = PHI %tmp_agpr, %a, %x, %c
2088// %1:areg = PHI %tmp_agpr, %a, %y, %c
2089// %2:areg = PHI %tmp_agpr, %a, %z, %c
2090bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2091 // This is only really needed on GFX908 where AGPR-AGPR copies are
2092 // unreasonably difficult.
2093 if (ST->hasGFX90AInsts())
2094 return false;
2095
2096 // Look at all AGPR Phis and collect the register + subregister used.
2097 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2098 RegToMO;
2099
2100 for (auto &MI : MBB) {
2101 if (!MI.isPHI())
2102 break;
2103
2104 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2105 continue;
2106
2107 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2108 MachineOperand &PhiMO = MI.getOperand(K);
2109 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2110 }
2111 }
2112
2113 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2114 // a VGPR.
2115 bool Changed = false;
2116 for (const auto &[Entry, MOs] : RegToMO) {
2117 if (MOs.size() == 1)
2118 continue;
2119
2120 const auto [Reg, SubReg] = Entry;
2121 MachineInstr *Def = MRI->getVRegDef(Reg);
2122 MachineBasicBlock *DefMBB = Def->getParent();
2123
2124 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2125 // out.
2126 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2127 Register TempVGPR =
2128 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2129 MachineInstr *VGPRCopy =
2130 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2131 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2132 .addReg(Reg, /* flags */ 0, SubReg);
2133
2134 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2135 Register TempAGPR = MRI->createVirtualRegister(ARC);
2136 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2137 TII->get(AMDGPU::COPY), TempAGPR)
2138 .addReg(TempVGPR);
2139
2140 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2141 for (MachineOperand *MO : MOs) {
2142 MO->setReg(TempAGPR);
2143 MO->setSubReg(AMDGPU::NoSubRegister);
2144 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2145 }
2146
2147 Changed = true;
2148 }
2149
2150 return Changed;
2151}
2152
2153bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2154 if (skipFunction(MF.getFunction()))
2155 return false;
2156
2157 MRI = &MF.getRegInfo();
2158 ST = &MF.getSubtarget<GCNSubtarget>();
2159 TII = ST->getInstrInfo();
2160 TRI = &TII->getRegisterInfo();
2161 MFI = MF.getInfo<SIMachineFunctionInfo>();
2162
2163 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2164 // correctly handle signed zeros.
2165 //
2166 // FIXME: Also need to check strictfp
2167 bool IsIEEEMode = MFI->getMode().IEEE;
2168 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2169
2170 bool Changed = false;
2171 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2172 MachineOperand *CurrentKnownM0Val = nullptr;
2173 for (auto &MI : make_early_inc_range(*MBB)) {
2174 Changed |= tryFoldCndMask(MI);
2175
2176 if (tryFoldZeroHighBits(MI)) {
2177 Changed = true;
2178 continue;
2179 }
2180
2181 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2182 Changed = true;
2183 continue;
2184 }
2185
2186 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2187 Changed = true;
2188 continue;
2189 }
2190
2191 if (MI.mayLoad() && tryFoldLoad(MI)) {
2192 Changed = true;
2193 continue;
2194 }
2195
2196 if (TII->isFoldableCopy(MI)) {
2197 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2198 continue;
2199 }
2200
2201 // Saw an unknown clobber of m0, so we no longer know what it is.
2202 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2203 CurrentKnownM0Val = nullptr;
2204
2205 // TODO: Omod might be OK if there is NSZ only on the source
2206 // instruction, and not the omod multiply.
2207 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2208 !tryFoldOMod(MI))
2209 Changed |= tryFoldClamp(MI);
2210 }
2211
2212 Changed |= tryOptimizeAGPRPhis(*MBB);
2213 }
2214
2215 return Changed;
2216}