LLVM 23.0.0git
SIFoldOperands.cpp
Go to the documentation of this file.
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
15#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarryInsts())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 bool tryConstantFoldOp(MachineInstr *MI) const;
246 bool tryFoldCndMask(MachineInstr &MI) const;
247 bool tryFoldZeroHighBits(MachineInstr &MI) const;
248 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
249
250 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
251 bool tryFoldFoldableCopy(MachineInstr &MI,
252 MachineOperand *&CurrentKnownM0Val) const;
253
254 const MachineOperand *isClamp(const MachineInstr &MI) const;
255 bool tryFoldClamp(MachineInstr &MI);
256
257 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
258 bool tryFoldOMod(MachineInstr &MI);
259 bool tryFoldRegSequence(MachineInstr &MI);
260 bool tryFoldPhiAGPR(MachineInstr &MI);
261 bool tryFoldLoad(MachineInstr &MI);
262
263 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
264
265public:
266 SIFoldOperandsImpl() = default;
267
268 bool run(MachineFunction &MF);
269};
270
271class SIFoldOperandsLegacy : public MachineFunctionPass {
272public:
273 static char ID;
274
275 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
276
277 bool runOnMachineFunction(MachineFunction &MF) override {
278 if (skipFunction(MF.getFunction()))
279 return false;
280 return SIFoldOperandsImpl().run(MF);
281 }
282
283 StringRef getPassName() const override { return "SI Fold Operands"; }
284
285 void getAnalysisUsage(AnalysisUsage &AU) const override {
286 AU.setPreservesCFG();
288 }
289
290 MachineFunctionProperties getRequiredProperties() const override {
291 return MachineFunctionProperties().setIsSSA();
292 }
293};
294
295} // End anonymous namespace.
296
297INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
298 false)
299
300char SIFoldOperandsLegacy::ID = 0;
301
302char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
303
306 const MachineOperand &MO) {
307 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
308 if (const TargetRegisterClass *SubRC =
309 TRI.getSubRegisterClass(RC, MO.getSubReg()))
310 RC = SubRC;
311 return RC;
312}
313
314// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
315static unsigned macToMad(unsigned Opc) {
316 switch (Opc) {
317 case AMDGPU::V_MAC_F32_e64:
318 return AMDGPU::V_MAD_F32_e64;
319 case AMDGPU::V_MAC_F16_e64:
320 return AMDGPU::V_MAD_F16_e64;
321 case AMDGPU::V_FMAC_F32_e64:
322 return AMDGPU::V_FMA_F32_e64;
323 case AMDGPU::V_FMAC_F16_e64:
324 return AMDGPU::V_FMA_F16_gfx9_e64;
325 case AMDGPU::V_FMAC_F16_t16_e64:
326 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
327 case AMDGPU::V_FMAC_F16_fake16_e64:
328 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
329 case AMDGPU::V_FMAC_LEGACY_F32_e64:
330 return AMDGPU::V_FMA_LEGACY_F32_e64;
331 case AMDGPU::V_FMAC_F64_e64:
332 return AMDGPU::V_FMA_F64_e64;
333 }
334 return AMDGPU::INSTRUCTION_LIST_END;
335}
336
337// TODO: Add heuristic that the frame index might not fit in the addressing mode
338// immediate offset to avoid materializing in loops.
339bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
340 const FoldableDef &OpToFold) const {
341 if (!OpToFold.isFI())
342 return false;
343
344 const unsigned Opc = UseMI.getOpcode();
345 switch (Opc) {
346 case AMDGPU::S_ADD_I32:
347 case AMDGPU::S_ADD_U32:
348 case AMDGPU::V_ADD_U32_e32:
349 case AMDGPU::V_ADD_CO_U32_e32:
350 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
351 // to insert the wave size shift at every point we use the index.
352 // TODO: Fix depending on visit order to fold immediates into the operand
353 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
354 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
355 case AMDGPU::V_ADD_U32_e64:
356 case AMDGPU::V_ADD_CO_U32_e64:
357 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
358 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
359 default:
360 break;
361 }
362
363 if (TII->isMUBUF(UseMI))
364 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
365 if (!TII->isFLATScratch(UseMI))
366 return false;
367
368 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
369 if (OpNo == SIdx)
370 return true;
371
372 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
373 return OpNo == VIdx && SIdx == -1;
374}
375
376/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
377///
378/// => %vgpr = V_ADD_U32 x, frameindex
379bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
380 Register DstReg, Register SrcReg, MachineInstr &MI) const {
381 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
382 MRI->hasOneNonDBGUse(SrcReg)) {
383 MachineInstr *Def = MRI->getVRegDef(SrcReg);
384 if (!Def || Def->getNumOperands() != 4)
385 return false;
386
387 MachineOperand *Src0 = &Def->getOperand(1);
388 MachineOperand *Src1 = &Def->getOperand(2);
389
390 // TODO: This is profitable with more operand types, and for more
391 // opcodes. But ultimately this is working around poor / nonexistent
392 // regbankselect.
393 if (!Src0->isFI() && !Src1->isFI())
394 return false;
395
396 if (Src0->isFI())
397 std::swap(Src0, Src1);
398
399 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
400 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
401 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
402 !Def->getOperand(3).isDead()) // Check if scc is dead
403 return false;
404
405 MachineBasicBlock *MBB = Def->getParent();
406 const DebugLoc &DL = Def->getDebugLoc();
407 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
408 MachineInstrBuilder Add =
409 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
410
411 if (Add->getDesc().getNumDefs() == 2) {
412 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
413 Add.addDef(CarryOutReg, RegState::Dead);
414 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
415 }
416
417 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
418 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
419 Add.addImm(0);
420
421 Def->eraseFromParent();
422 MI.eraseFromParent();
423 return true;
424 }
425
426 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
427
429 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
430 if (Liveness == MachineBasicBlock::LQR_Dead) {
431 // TODO: If src1 satisfies operand constraints, use vop3 version.
432 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
433 .add(*Src0)
434 .add(*Src1)
435 .setOperandDead(3) // implicit-def $vcc
436 .setMIFlags(Def->getFlags());
437 Def->eraseFromParent();
438 MI.eraseFromParent();
439 return true;
440 }
441 }
442
443 return false;
444}
445
447 return new SIFoldOperandsLegacy();
448}
449
450bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
451 unsigned UseOpNo,
452 int64_t ImmVal) const {
453 const uint64_t TSFlags = MI->getDesc().TSFlags;
454
455 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
456 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
457 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
458 return false;
459
460 const MachineOperand &Old = MI->getOperand(UseOpNo);
461 int OpNo = MI->getOperandNo(&Old);
462
463 unsigned Opcode = MI->getOpcode();
464 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
465 switch (OpType) {
466 default:
467 return false;
475 // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
476 // two different constants.
477 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
478 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
479 return false;
480 break;
481 }
482
483 return true;
484}
485
486bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
487 int64_t ImmVal) const {
488 MachineOperand &Old = MI->getOperand(UseOpNo);
489 unsigned Opcode = MI->getOpcode();
490 int OpNo = MI->getOperandNo(&Old);
491 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
492
493 // If the literal can be inlined as-is, apply it and short-circuit the
494 // tests below. The main motivation for this is to avoid unintuitive
495 // uses of opsel.
496 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
497 Old.ChangeToImmediate(ImmVal);
498 return true;
499 }
500
501 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
502 // op_sel in a way that allows an inline constant.
503 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
504 unsigned SrcIdx = ~0;
505 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
506 ModName = AMDGPU::OpName::src0_modifiers;
507 SrcIdx = 0;
508 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
509 ModName = AMDGPU::OpName::src1_modifiers;
510 SrcIdx = 1;
511 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
512 ModName = AMDGPU::OpName::src2_modifiers;
513 SrcIdx = 2;
514 }
515 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
516 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
517 MachineOperand &Mod = MI->getOperand(ModIdx);
518 unsigned ModVal = Mod.getImm();
519
520 uint16_t ImmLo =
521 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
522 uint16_t ImmHi =
523 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
524 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
525 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
526
527 // Helper function that attempts to inline the given value with a newly
528 // chosen opsel pattern.
529 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
530 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
531 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
532 Old.ChangeToImmediate(Imm);
533 return true;
534 }
535
536 // Try to shuffle the halves around and leverage opsel to get an inline
537 // constant.
538 uint16_t Lo = static_cast<uint16_t>(Imm);
539 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
540 if (Lo == Hi) {
541 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
542 Mod.setImm(NewModVal);
544 return true;
545 }
546
547 if (static_cast<int16_t>(Lo) < 0) {
548 int32_t SExt = static_cast<int16_t>(Lo);
549 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
550 Mod.setImm(NewModVal);
551 Old.ChangeToImmediate(SExt);
552 return true;
553 }
554 }
555
556 // This check is only useful for integer instructions
557 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
558 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
559 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
560 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
561 return true;
562 }
563 }
564 } else {
565 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
566 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
567 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
568 Old.ChangeToImmediate(Swapped);
569 return true;
570 }
571 }
572
573 return false;
574 };
575
576 if (tryFoldToInline(Imm))
577 return true;
578
579 // Replace integer addition by subtraction and vice versa if it allows
580 // folding the immediate to an inline constant.
581 //
582 // We should only ever get here for SrcIdx == 1 due to canonicalization
583 // earlier in the pipeline, but we double-check here to be safe / fully
584 // general.
585 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
586 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
587 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
588 unsigned ClampIdx =
589 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
590 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
591
592 if (!Clamp) {
593 uint16_t NegLo = -static_cast<uint16_t>(Imm);
594 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
595 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
596
597 if (tryFoldToInline(NegImm)) {
598 unsigned NegOpcode =
599 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
600 MI->setDesc(TII->get(NegOpcode));
601 return true;
602 }
603 }
604 }
605
606 return false;
607}
608
609bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
610 MachineInstr *MI = Fold.UseMI;
611 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
612 assert(Old.isReg());
613
614 std::optional<int64_t> ImmVal;
615 if (Fold.isImm())
616 ImmVal = Fold.Def.getEffectiveImmVal();
617
618 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
619 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
620 return true;
621
622 // We can't represent the candidate as an inline constant. Try as a literal
623 // with the original opsel, checking constant bus limitations.
624 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
625 int OpNo = MI->getOperandNo(&Old);
626 if (!TII->isOperandLegal(*MI, OpNo, &New))
627 return false;
628 Old.ChangeToImmediate(*ImmVal);
629 return true;
630 }
631
632 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
633 MachineBasicBlock *MBB = MI->getParent();
634 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
635 if (Liveness != MachineBasicBlock::LQR_Dead) {
636 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
637 return false;
638 }
639
640 int Op32 = Fold.ShrinkOpcode;
641 MachineOperand &Dst0 = MI->getOperand(0);
642 MachineOperand &Dst1 = MI->getOperand(1);
643 assert(Dst0.isDef() && Dst1.isDef());
644
645 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
646
647 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
648 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
649
650 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
651
652 if (HaveNonDbgCarryUse) {
653 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
654 Dst1.getReg())
655 .addReg(AMDGPU::VCC, RegState::Kill);
656 }
657
658 // Keep the old instruction around to avoid breaking iterators, but
659 // replace it with a dummy instruction to remove uses.
660 //
661 // FIXME: We should not invert how this pass looks at operands to avoid
662 // this. Should track set of foldable movs instead of looking for uses
663 // when looking at a use.
664 Dst0.setReg(NewReg0);
665 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
666 MI->removeOperand(I);
667 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
668
669 if (Fold.Commuted)
670 TII->commuteInstruction(*Inst32, false);
671 return true;
672 }
673
674 assert(!Fold.needsShrink() && "not handled");
675
676 if (ImmVal) {
677 if (Old.isTied()) {
678 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
679 if (NewMFMAOpc == -1)
680 return false;
681 MI->setDesc(TII->get(NewMFMAOpc));
682 MI->untieRegOperand(0);
683 const MCInstrDesc &MCID = MI->getDesc();
684 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
686 MI->getOperand(I).setIsEarlyClobber(true);
687 }
688
689 // TODO: Should we try to avoid adding this to the candidate list?
690 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
691 int OpNo = MI->getOperandNo(&Old);
692 if (!TII->isOperandLegal(*MI, OpNo, &New))
693 return false;
694
695 Old.ChangeToImmediate(*ImmVal);
696 return true;
697 }
698
699 if (Fold.isGlobal()) {
700 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
701 Fold.Def.OpToFold->getOffset(),
702 Fold.Def.OpToFold->getTargetFlags());
703 return true;
704 }
705
706 if (Fold.isFI()) {
707 Old.ChangeToFrameIndex(Fold.getFI());
708 return true;
709 }
710
711 MachineOperand *New = Fold.Def.OpToFold;
712
713 // Verify the register is compatible with the operand.
714 if (const TargetRegisterClass *OpRC =
715 TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
716 const TargetRegisterClass *NewRC =
717 TRI->getRegClassForReg(*MRI, New->getReg());
718
719 const TargetRegisterClass *ConstrainRC = OpRC;
720 if (New->getSubReg()) {
721 ConstrainRC =
722 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
723
724 if (!ConstrainRC)
725 return false;
726 }
727
728 if (New->getReg().isVirtual() &&
729 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
730 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
731 << TRI->getRegClassName(ConstrainRC) << '\n');
732 return false;
733 }
734 }
735
736 // Rework once the VS_16 register class is updated to include proper
737 // 16-bit SGPRs instead of 32-bit ones.
738 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
739 Old.setSubReg(AMDGPU::NoSubRegister);
740 if (New->getReg().isPhysical()) {
741 Old.substPhysReg(New->getReg(), *TRI);
742 } else {
743 Register OldReg = Old.getReg();
744 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
745 Old.setIsUndef(New->isUndef());
746
747 // If MI is in a BUNDLE, also update header's matching implicit use.
748 if (MI->isBundledWithPred()) {
749 MachineInstr &Header = *getBundleStart(MI->getIterator());
750 for (MachineOperand &MO : Header.operands()) {
751 if (MO.getReg() == OldReg) {
752 MO.setReg(New->getReg());
753 MO.setSubReg(New->getSubReg());
754 }
755 }
756 }
757 }
758 return true;
759}
760
762 FoldCandidate &&Entry) {
763 // Skip additional folding on the same operand.
764 for (FoldCandidate &Fold : FoldList)
765 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
766 return;
767 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
768 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
769 FoldList.push_back(Entry);
770}
771
773 MachineInstr *MI, unsigned OpNo,
774 const FoldableDef &FoldOp,
775 bool Commuted = false, int ShrinkOp = -1) {
776 appendFoldCandidate(FoldList,
777 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
778}
779
780// Returns true if the instruction is a packed F32 instruction and the
781// corresponding scalar operand reads 32 bits and replicates the bits to both
782// channels.
784 const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
785 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
786 return false;
787 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
789}
790
791// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
792// literal) and replicate the bits to both channels. Therefore, if the hi and
793// lo halves are not the same, we can't fold it.
795 const FoldableDef &OpToFold) {
796 assert(OpToFold.isImm() && "Expected immediate operand");
797 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
798 uint32_t Lo = Lo_32(ImmVal);
799 uint32_t Hi = Hi_32(ImmVal);
800 return Lo == Hi;
801}
802
803bool SIFoldOperandsImpl::tryAddToFoldList(
804 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
805 const FoldableDef &OpToFold) const {
806 const unsigned Opc = MI->getOpcode();
807
808 auto tryToFoldAsFMAAKorMK = [&]() {
809 if (!OpToFold.isImm())
810 return false;
811
812 const bool TryAK = OpNo == 3;
813 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
814 MI->setDesc(TII->get(NewOpc));
815
816 // We have to fold into operand which would be Imm not into OpNo.
817 bool FoldAsFMAAKorMK =
818 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
819 if (FoldAsFMAAKorMK) {
820 // Untie Src2 of fmac.
821 MI->untieRegOperand(3);
822 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
823 if (OpNo == 1) {
824 MachineOperand &Op1 = MI->getOperand(1);
825 MachineOperand &Op2 = MI->getOperand(2);
826 Register OldReg = Op1.getReg();
827 // Operand 2 might be an inlinable constant
828 if (Op2.isImm()) {
829 Op1.ChangeToImmediate(Op2.getImm());
830 Op2.ChangeToRegister(OldReg, false);
831 } else {
832 Op1.setReg(Op2.getReg());
833 Op2.setReg(OldReg);
834 }
835 }
836 return true;
837 }
838 MI->setDesc(TII->get(Opc));
839 return false;
840 };
841
842 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
843 if (!IsLegal && OpToFold.isImm()) {
844 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
845 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
846 }
847
848 if (!IsLegal) {
849 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
850 unsigned NewOpc = macToMad(Opc);
851 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
852 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
853 // to fold the operand.
854 MI->setDesc(TII->get(NewOpc));
855 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
856 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
857 if (AddOpSel)
858 MI->addOperand(MachineOperand::CreateImm(0));
859 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
860 if (FoldAsMAD) {
861 MI->untieRegOperand(OpNo);
862 return true;
863 }
864 if (AddOpSel)
865 MI->removeOperand(MI->getNumExplicitOperands() - 1);
866 MI->setDesc(TII->get(Opc));
867 }
868
869 // Special case for s_fmac_f32 if we are trying to fold into Src2.
870 // By transforming into fmaak we can untie Src2 and make folding legal.
871 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
872 if (tryToFoldAsFMAAKorMK())
873 return true;
874 }
875
876 // Special case for s_setreg_b32
877 if (OpToFold.isImm()) {
878 unsigned ImmOpc = 0;
879 if (Opc == AMDGPU::S_SETREG_B32)
880 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
881 else if (Opc == AMDGPU::S_SETREG_B32_mode)
882 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
883 if (ImmOpc) {
884 MI->setDesc(TII->get(ImmOpc));
885 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
886 return true;
887 }
888 }
889
890 // Operand is not legal, so try to commute the instruction to
891 // see if this makes it possible to fold.
892 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
893 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
894 if (!CanCommute)
895 return false;
896
897 MachineOperand &Op = MI->getOperand(OpNo);
898 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
899
900 // One of operands might be an Imm operand, and OpNo may refer to it after
901 // the call of commuteInstruction() below. Such situations are avoided
902 // here explicitly as OpNo must be a register operand to be a candidate
903 // for memory folding.
904 if (!Op.isReg() || !CommutedOp.isReg())
905 return false;
906
907 // The same situation with an immediate could reproduce if both inputs are
908 // the same register.
909 if (Op.isReg() && CommutedOp.isReg() &&
910 (Op.getReg() == CommutedOp.getReg() &&
911 Op.getSubReg() == CommutedOp.getSubReg()))
912 return false;
913
914 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
915 return false;
916
917 int Op32 = -1;
918 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
919 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
920 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
921 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
922 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
923 return false;
924 }
925
926 // Verify the other operand is a VGPR, otherwise we would violate the
927 // constant bus restriction.
928 MachineOperand &OtherOp = MI->getOperand(OpNo);
929 if (!OtherOp.isReg() ||
930 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
931 return false;
932
933 assert(MI->getOperand(1).isDef());
934
935 // Make sure to get the 32-bit version of the commuted opcode.
936 unsigned MaybeCommutedOpc = MI->getOpcode();
937 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
938 }
939
940 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
941 Op32);
942 return true;
943 }
944
945 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
946 // By changing into fmamk we can untie Src2.
947 // If folding for Src0 happens first and it is identical operand to Src1 we
948 // should avoid transforming into fmamk which requires commuting as it would
949 // cause folding into Src1 to fail later on due to wrong OpNo used.
950 if (Opc == AMDGPU::S_FMAC_F32 &&
951 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
952 if (tryToFoldAsFMAAKorMK())
953 return true;
954 }
955
956 // Special case for PK_F32 instructions if we are trying to fold an imm to
957 // src0 or src1.
958 if (OpToFold.isImm() &&
961 return false;
962
963 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
964 return true;
965}
966
967bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
968 const MachineOperand &UseMO) const {
969 // Operands of SDWA instructions must be registers.
970 return !TII->isSDWA(MI);
971}
972
974 const MachineRegisterInfo &MRI,
975 Register SrcReg) {
976 MachineOperand *Sub = nullptr;
977 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
978 SubDef && TII.isFoldableCopy(*SubDef);
979 SubDef = MRI.getVRegDef(Sub->getReg())) {
980 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
981 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
982
983 if (SrcOp.isImm())
984 return &SrcOp;
985 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
986 break;
987 Sub = &SrcOp;
988 // TODO: Support compose
989 if (SrcOp.getSubReg())
990 break;
991 }
992
993 return Sub;
994}
995
996const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
997 MachineInstr &RegSeq,
998 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
999
1000 assert(RegSeq.isRegSequence());
1001
1002 const TargetRegisterClass *RC = nullptr;
1003
1004 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
1005 MachineOperand &SrcOp = RegSeq.getOperand(I);
1006 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
1007
1008 // Only accept reg_sequence with uniform reg class inputs for simplicity.
1009 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
1010 if (!RC)
1011 RC = OpRC;
1012 else if (!TRI->getCommonSubClass(RC, OpRC))
1013 return nullptr;
1014
1015 if (SrcOp.getSubReg()) {
1016 // TODO: Handle subregister compose
1017 Defs.emplace_back(&SrcOp, SubRegIdx);
1018 continue;
1019 }
1020
1021 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
1022 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
1023 Defs.emplace_back(DefSrc, SubRegIdx);
1024 continue;
1025 }
1026
1027 Defs.emplace_back(&SrcOp, SubRegIdx);
1028 }
1029
1030 return RC;
1031}
1032
1033// Find a def of the UseReg, check if it is a reg_sequence and find initializers
1034// for each subreg, tracking it to an immediate if possible. Returns the
1035// register class of the inputs on success.
1036const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1037 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1038 Register UseReg) const {
1039 MachineInstr *Def = MRI->getVRegDef(UseReg);
1040 if (!Def || !Def->isRegSequence())
1041 return nullptr;
1042
1043 return getRegSeqInit(*Def, Defs);
1044}
1045
1046std::pair<int64_t, const TargetRegisterClass *>
1047SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1049 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1050 if (!SrcRC)
1051 return {};
1052
1053 bool TryToMatchSplat64 = false;
1054
1055 int64_t Imm;
1056 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1057 const MachineOperand *Op = Defs[I].first;
1058 if (!Op->isImm())
1059 return {};
1060
1061 int64_t SubImm = Op->getImm();
1062 if (!I) {
1063 Imm = SubImm;
1064 continue;
1065 }
1066
1067 if (Imm != SubImm) {
1068 if (I == 1 && (E & 1) == 0) {
1069 // If we have an even number of inputs, there's a chance this is a
1070 // 64-bit element splat broken into 32-bit pieces.
1071 TryToMatchSplat64 = true;
1072 break;
1073 }
1074
1075 return {}; // Can only fold splat constants
1076 }
1077 }
1078
1079 if (!TryToMatchSplat64)
1080 return {Defs[0].first->getImm(), SrcRC};
1081
1082 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1083 // (i.e. recognize every other other element is 0 for 64-bit immediates)
1084 int64_t SplatVal64;
1085 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1086 const MachineOperand *Op0 = Defs[I].first;
1087 const MachineOperand *Op1 = Defs[I + 1].first;
1088
1089 if (!Op0->isImm() || !Op1->isImm())
1090 return {};
1091
1092 unsigned SubReg0 = Defs[I].second;
1093 unsigned SubReg1 = Defs[I + 1].second;
1094
1095 // Assume we're going to generally encounter reg_sequences with sorted
1096 // subreg indexes, so reject any that aren't consecutive.
1097 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1098 TRI->getChannelFromSubReg(SubReg1))
1099 return {};
1100
1101 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1102 if (I == 0)
1103 SplatVal64 = MergedVal;
1104 else if (SplatVal64 != MergedVal)
1105 return {};
1106 }
1107
1108 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1109 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1110
1111 return {SplatVal64, RC64};
1112}
1113
// Check whether the splat constant SplatVal, whose elements have register
// class SplatRC, may stand in for operand UseOpIdx of UseMI. Returns true only
// if the operand index is within the instruction description, the operand is
// an SI source operand with a known register class, the element class is
// compatible (checked for values other than 0 and -1), and
// TII->isOperandLegal accepts an immediate with that value.
1114bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1115 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1116 const TargetRegisterClass *SplatRC) const {
1117 const MCInstrDesc &Desc = UseMI->getDesc();
1118 if (UseOpIdx >= Desc.getNumOperands())
1119 return false;
1120
1121 // Filter out unhandled pseudos.
1122 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1123 return false;
1124
1125 int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
1126 if (RCID == -1)
1127 return false;
1128
1129 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1130
1131 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1132 // have the same bits. These are the only cases where a splat has the same
1133 // interpretation for 32-bit and 64-bit splats.
1134 if (SplatVal != 0 && SplatVal != -1) {
1135 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1136 // operand will be AReg_128, and we want to check if it's compatible with an
1137 // AReg_32 constant.
1138 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1139 switch (OpTy) {
// NOTE(review): the case labels for the two arms below are not visible in this
// listing. From the bodies, the first arm narrows OpRC to a single 32-bit
// element (sub0) and the second to a 64-bit element (sub0_sub1); confirm the
// exact operand types against the real source.
1144 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1145 break;
1149 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1150 break;
1151 default:
1152 return false;
1153 }
1154
// The element class read by the operand must overlap the class of the splat.
1155 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1156 return false;
1157 }
1158
// Final legality check using a temporary immediate operand holding SplatVal.
1159 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1160 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1161 return false;
1162
1163 return true;
1164}
1165
// Try to queue an immediate OpToFold as a fold into operand UseOpIdx of UseMI.
// On success a candidate is appended to FoldList and true is returned; UseMI
// itself is not modified here. Returns false for out-of-range operand indices,
// for operands that are not SI source operands, and for values that are not
// legal immediates at that position.
1166bool SIFoldOperandsImpl::tryToFoldACImm(
1167 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1168 SmallVectorImpl<FoldCandidate> &FoldList) const {
1169 const MCInstrDesc &Desc = UseMI->getDesc();
1170 if (UseOpIdx >= Desc.getNumOperands())
1171 return false;
1172
1173 // Filter out unhandled pseudos.
1174 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1175 return false;
1176
1177 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
// NOTE(review): the condition guarding the following early return is not
// visible in this listing; as shown the return would be unconditional.
// Confirm against the real source before relying on this block.
1180 return false;
1181 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1182 return true;
1183 }
1184
1185 return false;
1186}
1187
// Try to fold OpToFold into operand UseOpIdx of UseMI.
//
// Depending on the user this either
//  - recurses into the users of a REG_SEQUENCE (or queues a splat fold),
//  - queues a FoldCandidate on FoldList (tryToFoldACImm / tryAddToFoldList), or
//  - mutates UseMI in place: frame-index addressing, COPY -> MOV conversion,
//    and V_READFIRSTLANE_B32 / V_READLANE_B32 -> S_MOV_B32 or COPY.
// Copies that were rewritten are appended to CopiesToReplace for the caller.
//
// NOTE(review): several lines of this function are not visible in this
// listing (see the individual notes below); the comments only describe what
// the visible lines establish.
1188void SIFoldOperandsImpl::foldOperand(
1189 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1190 SmallVectorImpl<FoldCandidate> &FoldList,
1191 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1192 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1193
1194 if (!isUseSafeToFold(*UseMI, *UseOp))
1195 return;
1196
1197 // FIXME: Fold operands with subregs.
1198 if (UseOp->isReg() && OpToFold.isReg()) {
1199 if (UseOp->isImplicit())
1200 return;
1201 // Allow folding from SGPRs to 16-bit VGPRs.
1202 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1203 (UseOp->getSubReg() != AMDGPU::lo16 ||
1204 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1205 return;
1206 }
1207
1208 // Special case for REG_SEQUENCE: We can't fold literals into
1209 // REG_SEQUENCE instructions, so we have to fold them into the
1210 // uses of REG_SEQUENCE.
1211 if (UseMI->isRegSequence()) {
1212 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
// The operand following the used value is the subregister index it occupies.
1213 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1214
1215 int64_t SplatVal;
1216 const TargetRegisterClass *SplatRC;
1217 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1218
1219 // Grab the use operands first
// NOTE(review): the line that declares UsesToProcess and opens this
// initializer is not visible in this listing.
1221 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
// Index-based loop: UsesToProcess can grow while it is being walked.
1222 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1223 MachineOperand *RSUse = UsesToProcess[I];
1224 MachineInstr *RSUseMI = RSUse->getParent();
1225 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1226
1227 if (SplatRC) {
1228 if (RSUseMI->isCopy()) {
1229 Register DstReg = RSUseMI->getOperand(0).getReg();
// NOTE(review): the second argument of this append_range call is not visible
// in this listing; presumably the uses of DstReg — confirm.
1230 append_range(UsesToProcess,
1232 continue;
1233 }
1234 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1235 FoldableDef SplatDef(SplatVal, SplatRC);
1236 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1237 continue;
1238 }
1239 }
1240
1241 // TODO: Handle general compose
1242 if (RSUse->getSubReg() != RegSeqDstSubReg)
1243 continue;
1244
1245 // FIXME: We should avoid recursing here. There should be a cleaner split
1246 // between the in-place mutations and adding to the fold list.
1247 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1248 CopiesToReplace);
1249 }
1250
1251 return;
1252 }
1253
1254 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1255 return;
1256
1257 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1258 // Verify that this is a stack access.
1259 // FIXME: Should probably use stack pseudos before frame lowering.
1260
1261 if (TII->isMUBUF(*UseMI)) {
1262 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1263 MFI->getScratchRSrcReg())
1264 return;
1265
1266 // Ensure this is either relative to the current frame or the current
1267 // wave.
1268 MachineOperand &SOff =
1269 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1270 if (!SOff.isImm() || SOff.getImm() != 0)
1271 return;
1272 }
1273
1274 const unsigned Opc = UseMI->getOpcode();
// Flat-scratch access that has a vaddr operand but no saddr operand: switch to
// the opcode returned by getFlatScratchInstSSfromSV before folding.
1275 if (TII->isFLATScratch(*UseMI) &&
1276 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1277 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1278 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1279 unsigned CPol =
1280 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
// NOTE(review): the second half of this condition is not visible in this
// listing.
1281 if ((CPol & AMDGPU::CPol::SCAL) &&
1283 return;
1284
1285 UseMI->setDesc(TII->get(NewOpc));
1286 }
1287
1288 // A frame index will resolve to a positive constant, so it should always be
1289 // safe to fold the addressing mode, even pre-GFX9.
1290 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1291
1292 return;
1293 }
1294
1295 bool FoldingImmLike =
1296 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1297
// Immediate-like value used by a COPY: rewrite the COPY as a concrete MOV so
// that the value can later be folded as the MOV's source.
1298 if (FoldingImmLike && UseMI->isCopy()) {
1299 Register DestReg = UseMI->getOperand(0).getReg();
1300 Register SrcReg = UseMI->getOperand(1).getReg();
1301 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1302 assert(SrcReg.isVirtual());
1303
1304 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1305
1306 // Don't fold into a copy to a physical register with the same class. Doing
1307 // so would interfere with the register coalescer's logic which would avoid
1308 // redundant initializations.
1309 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1310 return;
1311
1312 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1313 // In order to fold immediates into copies, we need to change the copy to a
1314 // MOV. Find a compatible mov instruction with the value.
// Candidates are tried in list order; `continue` moves on to the next opcode,
// `break` abandons the search.
1315 for (unsigned MovOp :
1316 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1317 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1318 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1319 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1320 const MCInstrDesc &MovDesc = TII->get(MovOp);
1321 const TargetRegisterClass *MovDstRC =
1322 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
1323
1324 // Fold if the destination register class of the MOV instruction (ResRC)
1325 // is a superclass of (or equal to) the destination register class of the
1326 // COPY (DestRC). If this condition fails, folding would be illegal.
1327 if (!DestRC->hasSuperClassEq(MovDstRC))
1328 continue;
1329
// V_MOV_B16_t16_e64 has a modifiers operand before src0, so its source sits at
// index 2 (see the operand rebuild further down).
1330 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1331
1332 int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
1333 if (RegClassID != -1) {
1334 const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
1335
1336 if (UseSubReg)
1337 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1338
1339 // FIXME: We should be able to directly check immediate operand legality
1340 // for all cases, but gfx908 hacks break.
1341 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1342 (!OpToFold.isImm() ||
1343 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1344 *OpToFold.getEffectiveImmVal())))
1345 break;
1346
1347 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1348 break;
1349
1350 // FIXME: This is mutating the instruction only and deferring the actual
1351 // fold of the immediate
1352 } else {
1353 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1354 // immediate to verify. Technically we should always verify this, but it
1355 // only matters for these concrete cases.
1356 // TODO: Handle non-imm case if it's useful.
1357 if (!OpToFold.isImm() ||
1358 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1359 break;
1360 }
1361
// NOTE(review): the declarations of ImpOpI / ImpOpE and the statement inside
// this loop that uses Tmp are not visible in this listing; presumably the loop
// strips the copy's implicit operands — confirm.
1364 while (ImpOpI != ImpOpE) {
1365 MachineInstr::mop_iterator Tmp = ImpOpI;
1366 ImpOpI++;
1368 }
1369 UseMI->setDesc(MovDesc);
1370
1371 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
// Rebuild the operand list as (dst, src0_modifiers, src0, op_sel); the source
// operand is copied before operand 1 is removed.
1372 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1373 MachineOperand NewSrcOp(SrcOp);
1374 MachineFunction *MF = UseMI->getMF();
1375 UseMI->removeOperand(1);
1376 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1377 UseMI->addOperand(NewSrcOp); // src0
1378 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1379 UseOpIdx = SrcIdx;
1380 UseOp = &UseMI->getOperand(UseOpIdx);
1381 }
1382 CopiesToReplace.push_back(UseMI);
1383 break;
1384 }
1385
1386 // We failed to replace the copy, so give up.
1387 if (UseMI->getOpcode() == AMDGPU::COPY)
1388 return;
1389
1390 } else {
// Register value into a plain virtual-register COPY whose source has no
// subregister and whose defining instruction has no implicit operands.
1391 if (UseMI->isCopy() && OpToFold.isReg() &&
1392 UseMI->getOperand(0).getReg().isVirtual() &&
1393 !UseMI->getOperand(1).getSubReg() &&
1394 OpToFold.DefMI->implicit_operands().empty()) {
1395 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1396 << *UseMI);
1397 unsigned Size = TII->getOpSize(*UseMI, 1);
1398 Register UseReg = OpToFold.getReg();
// NOTE(review): one line between these two statements is not visible in this
// listing.
1400 unsigned SubRegIdx = OpToFold.getSubReg();
1401 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1402 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1403 // VS_16RegClass
1404 //
1405 // Excerpt from AMDGPUGenRegisterInfoEnums.inc
1406 // NoSubRegister, //0
1407 // hi16, // 1
1408 // lo16, // 2
1409 // sub0, // 3
1410 // ...
1411 // sub1, // 11
1412 // sub1_hi16, // 12
1413 // sub1_lo16, // 13
1414 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1415 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1416 TRI->isSGPRReg(*MRI, UseReg)) {
1417 // Produce the 32 bit subregister index to which the 16-bit subregister
1418 // is aligned.
1419 if (SubRegIdx > AMDGPU::sub1) {
1420 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1421 M |= M.getLane(M.getHighestLane() - 1);
1422 SmallVector<unsigned, 4> Indexes;
1423 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1424 Indexes);
1425 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1426 SubRegIdx = Indexes[0];
1427 // 32-bit registers do not have a sub0 index
1428 } else if (TII->getOpSize(*UseMI, 1) == 4)
1429 SubRegIdx = 0;
1430 else
1431 SubRegIdx = AMDGPU::sub0;
1432 }
1433 UseMI->getOperand(1).setSubReg(SubRegIdx);
1434 UseMI->getOperand(1).setIsKill(false);
1435 CopiesToReplace.push_back(UseMI);
1436 OpToFold.OpToFold->setIsKill(false);
1437
1438 // Remove kill flags as kills may now be out of order with uses.
1439 MRI->clearKillFlags(UseReg);
1440 if (foldCopyToAGPRRegSequence(UseMI))
1441 return;
1442 }
1443
1444 unsigned UseOpc = UseMI->getOpcode();
// Folding into V_READFIRSTLANE_B32, or into src0 of V_READLANE_B32.
1445 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1446 (UseOpc == AMDGPU::V_READLANE_B32 &&
1447 (int)UseOpIdx ==
1448 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1449 // %vgpr = V_MOV_B32 imm
1450 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1451 // =>
1452 // %sgpr = S_MOV_B32 imm
1453 if (FoldingImmLike) {
// NOTE(review): the start of this condition is not visible in this listing;
// its arguments (register, defining instruction, user) suggest a check that
// the value is unchanged between def and use — confirm.
1455 UseMI->getOperand(UseOpIdx).getReg(),
1456 *OpToFold.DefMI, *UseMI))
1457 return;
1458
1459 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1461
1462 if (OpToFold.isImm()) {
// NOTE(review): the call that consumes the following argument is not visible
// in this listing.
1464 *OpToFold.getEffectiveImmVal());
1465 } else if (OpToFold.isFI())
1466 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1467 else {
1468 assert(OpToFold.isGlobal());
1469 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1470 OpToFold.OpToFold->getOffset(),
1471 OpToFold.OpToFold->getTargetFlags());
1472 }
1473 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1474 return;
1475 }
1476
1477 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
// NOTE(review): the start of this condition is not visible in this listing
// (same shape as the check in the immediate case above).
1479 UseMI->getOperand(UseOpIdx).getReg(),
1480 *OpToFold.DefMI, *UseMI))
1481 return;
1482
1483 // %vgpr = COPY %sgpr0
1484 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1485 // =>
1486 // %sgpr1 = COPY %sgpr0
1487 UseMI->setDesc(TII->get(AMDGPU::COPY));
1488 UseMI->getOperand(1).setReg(OpToFold.getReg());
1489 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1490 UseMI->getOperand(1).setIsKill(false);
1491 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1493 return;
1494 }
1495 }
1496
1497 const MCInstrDesc &UseDesc = UseMI->getDesc();
1498
1499 // Don't fold into target independent nodes. Target independent opcodes
1500 // don't have defined register classes.
1501 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1502 UseDesc.operands()[UseOpIdx].RegClass == -1)
1503 return;
1504 }
1505
1506 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1507 // to enable more folding opportunities. The shrink operands pass
1508 // already does this.
1509
// Everything else is queued as a regular fold candidate.
1510 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1511}
1512
1513static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1515 switch (Opcode) {
1516 case AMDGPU::S_ADD_I32:
1517 case AMDGPU::S_ADD_U32:
1518 Result = LHS + RHS;
1519 return true;
1520 case AMDGPU::S_SUB_I32:
1521 case AMDGPU::S_SUB_U32:
1522 Result = LHS - RHS;
1523 return true;
1524 case AMDGPU::V_AND_B32_e64:
1525 case AMDGPU::V_AND_B32_e32:
1526 case AMDGPU::S_AND_B32:
1527 Result = LHS & RHS;
1528 return true;
1529 case AMDGPU::V_OR_B32_e64:
1530 case AMDGPU::V_OR_B32_e32:
1531 case AMDGPU::S_OR_B32:
1532 Result = LHS | RHS;
1533 return true;
1534 case AMDGPU::V_XOR_B32_e64:
1535 case AMDGPU::V_XOR_B32_e32:
1536 case AMDGPU::S_XOR_B32:
1537 Result = LHS ^ RHS;
1538 return true;
1539 case AMDGPU::S_XNOR_B32:
1540 Result = ~(LHS ^ RHS);
1541 return true;
1542 case AMDGPU::S_NAND_B32:
1543 Result = ~(LHS & RHS);
1544 return true;
1545 case AMDGPU::S_NOR_B32:
1546 Result = ~(LHS | RHS);
1547 return true;
1548 case AMDGPU::S_ANDN2_B32:
1549 Result = LHS & ~RHS;
1550 return true;
1551 case AMDGPU::S_ORN2_B32:
1552 Result = LHS | ~RHS;
1553 return true;
1554 case AMDGPU::V_LSHL_B32_e64:
1555 case AMDGPU::V_LSHL_B32_e32:
1556 case AMDGPU::S_LSHL_B32:
1557 // The instruction ignores the high bits for out of bounds shifts.
1558 Result = LHS << (RHS & 31);
1559 return true;
1560 case AMDGPU::V_LSHLREV_B32_e64:
1561 case AMDGPU::V_LSHLREV_B32_e32:
1562 Result = RHS << (LHS & 31);
1563 return true;
1564 case AMDGPU::V_LSHR_B32_e64:
1565 case AMDGPU::V_LSHR_B32_e32:
1566 case AMDGPU::S_LSHR_B32:
1567 Result = LHS >> (RHS & 31);
1568 return true;
1569 case AMDGPU::V_LSHRREV_B32_e64:
1570 case AMDGPU::V_LSHRREV_B32_e32:
1571 Result = RHS >> (LHS & 31);
1572 return true;
1573 case AMDGPU::V_ASHR_I32_e64:
1574 case AMDGPU::V_ASHR_I32_e32:
1575 case AMDGPU::S_ASHR_I32:
1576 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1577 return true;
1578 case AMDGPU::V_ASHRREV_I32_e64:
1579 case AMDGPU::V_ASHRREV_I32_e32:
1580 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1581 return true;
1582 default:
1583 return false;
1584 }
1585}
1586
1587static unsigned getMovOpc(bool IsScalar) {
1588 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1589}
1590
1591// Try to simplify operations with a constant that may appear after instruction
1592// selection.
1593// TODO: See if a frame index with a fixed offset can fold.
1594bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1595 if (!MI->allImplicitDefsAreDead())
1596 return false;
1597
1598 unsigned Opc = MI->getOpcode();
1599
1600 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1601 if (Src0Idx == -1)
1602 return false;
1603
1604 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1605 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1606
1607 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1608 Opc == AMDGPU::S_NOT_B32) &&
1609 Src0Imm) {
1610 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1611 TII->mutateAndCleanupImplicit(
1612 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1613 return true;
1614 }
1615
1616 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1617 if (Src1Idx == -1)
1618 return false;
1619
1620 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1621 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1622
1623 if (!Src0Imm && !Src1Imm)
1624 return false;
1625
1626 // and k0, k1 -> v_mov_b32 (k0 & k1)
1627 // or k0, k1 -> v_mov_b32 (k0 | k1)
1628 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1629 if (Src0Imm && Src1Imm) {
1630 int32_t NewImm;
1631 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1632 return false;
1633
1634 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1635
1636 // Be careful to change the right operand, src0 may belong to a different
1637 // instruction.
1638 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1639 MI->removeOperand(Src1Idx);
1640 TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
1641 return true;
1642 }
1643
1644 // S_SUB_* is not commutable, so handle it before the commutability gate.
1645 // Only `x - 0 -> copy x` is valid; `0 - x` is a negation, not a copy.
1646 if (Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U32) {
1647 if (Src1Imm && static_cast<int32_t>(*Src1Imm) == 0) {
1648 // y = sub x, 0 => y = copy x
1649 MI->removeOperand(Src1Idx);
1650 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1651 return true;
1652 }
1653 return false;
1654 }
1655
1656 if (!MI->isCommutable())
1657 return false;
1658
1659 if (Src0Imm && !Src1Imm) {
1660 std::swap(Src0, Src1);
1661 std::swap(Src0Idx, Src1Idx);
1662 std::swap(Src0Imm, Src1Imm);
1663 }
1664
1665 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1666 if (Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_ADD_U32) {
1667 if (Src1Val == 0) {
1668 // y = add x, 0 => y = copy x
1669 MI->removeOperand(Src1Idx);
1670 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1671 return true;
1672 }
1673 return false;
1674 }
1675
1676 if (Opc == AMDGPU::V_OR_B32_e64 ||
1677 Opc == AMDGPU::V_OR_B32_e32 ||
1678 Opc == AMDGPU::S_OR_B32) {
1679 if (Src1Val == 0) {
1680 // y = or x, 0 => y = copy x
1681 MI->removeOperand(Src1Idx);
1682 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1683 } else if (Src1Val == -1) {
1684 // y = or x, -1 => y = v_mov_b32 -1
1685 MI->removeOperand(Src0Idx);
1686 TII->mutateAndCleanupImplicit(
1687 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1688 } else
1689 return false;
1690
1691 return true;
1692 }
1693
1694 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1695 Opc == AMDGPU::S_AND_B32) {
1696 if (Src1Val == 0) {
1697 // y = and x, 0 => y = v_mov_b32 0
1698 MI->removeOperand(Src0Idx);
1699 TII->mutateAndCleanupImplicit(
1700 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1701 } else if (Src1Val == -1) {
1702 // y = and x, -1 => y = copy x
1703 MI->removeOperand(Src1Idx);
1704 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1705 } else
1706 return false;
1707
1708 return true;
1709 }
1710
1711 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1712 Opc == AMDGPU::S_XOR_B32) {
1713 if (Src1Val == 0) {
1714 // y = xor x, 0 => y = copy x
1715 MI->removeOperand(Src1Idx);
1716 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1717 return true;
1718 }
1719 }
1720
1721 return false;
1722}
1723
1724// Try to fold an instruction into a simpler one
1725bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1726 unsigned Opc = MI.getOpcode();
1727 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1728 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1729 return false;
1730
1731 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1732 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1733 if (!Src1->isIdenticalTo(*Src0)) {
1734 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1735 if (!Src1Imm)
1736 return false;
1737
1738 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1739 if (!Src0Imm || *Src0Imm != *Src1Imm)
1740 return false;
1741 }
1742
1743 int Src1ModIdx =
1744 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1745 int Src0ModIdx =
1746 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1747 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1748 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1749 return false;
1750
1751 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1752 auto &NewDesc =
1753 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1754 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1755 if (Src2Idx != -1)
1756 MI.removeOperand(Src2Idx);
1757 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1758 if (Src1ModIdx != -1)
1759 MI.removeOperand(Src1ModIdx);
1760 if (Src0ModIdx != -1)
1761 MI.removeOperand(Src0ModIdx);
1762 TII->mutateAndCleanupImplicit(MI, NewDesc);
1763 LLVM_DEBUG(dbgs() << MI);
1764 return true;
1765}
1766
1767bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1768 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1769 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1770 return false;
1771
1772 std::optional<int64_t> Src0Imm =
1773 TII->getImmOrMaterializedImm(MI.getOperand(1));
1774 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1775 return false;
1776
1777 Register Src1 = MI.getOperand(2).getReg();
1778 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1779 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1780 return false;
1781
1782 Register Dst = MI.getOperand(0).getReg();
1783 MRI->replaceRegWith(Dst, Src1);
1784 if (!MI.getOperand(2).isKill())
1785 MRI->clearKillFlags(Src1);
1786 MI.eraseFromParent();
1787 return true;
1788}
1789
1790bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1791 const FoldableDef &OpToFold) const {
1792 // We need mutate the operands of new mov instructions to add implicit
1793 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1794 // this.
1795 SmallVector<MachineInstr *, 4> CopiesToReplace;
1797 MachineOperand &Dst = MI.getOperand(0);
1798 bool Changed = false;
1799
1800 if (OpToFold.isImm()) {
1801 for (auto &UseMI :
1802 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1803 // Folding the immediate may reveal operations that can be constant
1804 // folded or replaced with a copy. This can happen for example after
1805 // frame indices are lowered to constants or from splitting 64-bit
1806 // constants.
1807 //
1808 // We may also encounter cases where one or both operands are
1809 // immediates materialized into a register, which would ordinarily not
1810 // be folded due to multiple uses or operand constraints.
1811 if (tryConstantFoldOp(&UseMI)) {
1812 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1813 Changed = true;
1814 }
1815 }
1816 }
1817
1819 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1820 for (auto *U : UsesToProcess) {
1821 MachineInstr *UseMI = U->getParent();
1822
1823 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1824 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1825 CopiesToReplace);
1826 }
1827
1828 if (CopiesToReplace.empty() && FoldList.empty())
1829 return Changed;
1830
1831 MachineFunction *MF = MI.getMF();
1832 // Make sure we add EXEC uses to any new v_mov instructions created.
1833 for (MachineInstr *Copy : CopiesToReplace)
1834 Copy->addImplicitDefUseOperands(*MF);
1835
1836 SetVector<MachineInstr *> ConstantFoldCandidates;
1837 for (FoldCandidate &Fold : FoldList) {
1838 assert(!Fold.isReg() || Fold.Def.OpToFold);
1839 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1840 Register Reg = Fold.getReg();
1841 const MachineInstr *DefMI = Fold.Def.DefMI;
1842 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1843 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1844 continue;
1845 }
1846 if (updateOperand(Fold)) {
1847 // Clear kill flags.
1848 if (Fold.isReg()) {
1849 assert(Fold.Def.OpToFold && Fold.isReg());
1850 // FIXME: Probably shouldn't bother trying to fold if not an
1851 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1852 // copies.
1853 MRI->clearKillFlags(Fold.getReg());
1854 }
1855 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1856 << static_cast<int>(Fold.UseOpNo) << " of "
1857 << *Fold.UseMI);
1858
1859 if (Fold.isImm())
1860 ConstantFoldCandidates.insert(Fold.UseMI);
1861
1862 } else if (Fold.Commuted) {
1863 // Restoring instruction's original operand order if fold has failed.
1864 TII->commuteInstruction(*Fold.UseMI, false);
1865 }
1866 }
1867
1868 for (MachineInstr *MI : ConstantFoldCandidates) {
1869 if (tryConstantFoldOp(MI)) {
1870 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1871 Changed = true;
1872 }
1873 }
1874 return true;
1875}
1876
1877/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1878/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1879bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1880 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1881 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1882 // initializers right here, so we will rematerialize immediates and avoid
1883 // copies via different reg classes.
1884 const TargetRegisterClass *DefRC =
1885 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1886 if (!TRI->isAGPRClass(DefRC))
1887 return false;
1888
1889 Register UseReg = CopyMI->getOperand(1).getReg();
1890 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1891 if (!RegSeq || !RegSeq->isRegSequence())
1892 return false;
1893
1894 const DebugLoc &DL = CopyMI->getDebugLoc();
1895 MachineBasicBlock &MBB = *CopyMI->getParent();
1896
1897 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1898 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1899
1900 const TargetRegisterClass *UseRC =
1901 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1902
1903 // Value, subregindex for new REG_SEQUENCE
1905
1906 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1907 unsigned NumFoldable = 0;
1908
1909 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1910 MachineOperand &RegOp = RegSeq->getOperand(I);
1911 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1912
1913 if (RegOp.getSubReg()) {
1914 // TODO: Handle subregister compose
1915 NewDefs.emplace_back(&RegOp, SubRegIdx);
1916 continue;
1917 }
1918
1919 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1920 if (!Lookup)
1921 Lookup = &RegOp;
1922
1923 if (Lookup->isImm()) {
1924 // Check if this is an agpr_32 subregister.
1925 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1926 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1927 if (DestSuperRC &&
1928 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1929 ++NumFoldable;
1930 NewDefs.emplace_back(Lookup, SubRegIdx);
1931 continue;
1932 }
1933 }
1934
1935 const TargetRegisterClass *InputRC =
1936 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1937 : MRI->getRegClass(RegOp.getReg());
1938
1939 // TODO: Account for Lookup->getSubReg()
1940
1941 // If we can't find a matching super class, this is an SGPR->AGPR or
1942 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1943 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1944 // want to rewrite to copy to an intermediate VGPR class.
1945 const TargetRegisterClass *MatchRC =
1946 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1947 if (!MatchRC) {
1948 ++NumFoldable;
1949 NewDefs.emplace_back(&RegOp, SubRegIdx);
1950 continue;
1951 }
1952
1953 NewDefs.emplace_back(&RegOp, SubRegIdx);
1954 }
1955
1956 // Do not clone a reg_sequence and merely change the result register class.
1957 if (NumFoldable == 0)
1958 return false;
1959
1960 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1961 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1962 CopyMI->removeOperand(I);
1963
1964 for (auto [Def, DestSubIdx] : NewDefs) {
1965 if (!Def->isReg()) {
1966 // TODO: Should we use single write for each repeated value like in
1967 // register case?
1968 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1969 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1970 .add(*Def);
1971 B.addReg(Tmp);
1972 } else {
1973 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1974 Def->setIsKill(false);
1975
1976 Register &VGPRCopy = VGPRCopies[Src];
1977 if (!VGPRCopy) {
1978 const TargetRegisterClass *VGPRUseSubRC =
1979 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1980
1981 // We cannot build a reg_sequence out of the same registers, they
1982 // must be copied. Better do it here before copyPhysReg() created
1983 // several reads to do the AGPR->VGPR->AGPR copy.
1984
1985 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1986 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1987 // later, create a copy here and track if we already have such a copy.
1988 const TargetRegisterClass *SubRC =
1989 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1990 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1991 // TODO: Try to reconstrain class
1992 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1993 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1994 B.addReg(VGPRCopy);
1995 } else {
1996 // If it is already a VGPR, do not copy the register.
1997 B.add(*Def);
1998 }
1999 } else {
2000 B.addReg(VGPRCopy);
2001 }
2002 }
2003
2004 B.addImm(DestSubIdx);
2005 }
2006
2007 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
2008 return true;
2009}
2010
2011bool SIFoldOperandsImpl::tryFoldFoldableCopy(
2012 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
2013 Register DstReg = MI.getOperand(0).getReg();
2014 // Specially track simple redefs of m0 to the same value in a block, so we
2015 // can erase the later ones.
2016 if (DstReg == AMDGPU::M0) {
2017 MachineOperand &NewM0Val = MI.getOperand(1);
2018 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
2019 MI.eraseFromParent();
2020 return true;
2021 }
2022
2023 // We aren't tracking other physical registers
2024 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
2025 ? nullptr
2026 : &NewM0Val;
2027 return false;
2028 }
2029
2030 MachineOperand *OpToFoldPtr;
2031 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2032 // Folding when any src_modifiers are non-zero is unsupported
2033 if (TII->hasAnyModifiersSet(MI))
2034 return false;
2035 OpToFoldPtr = &MI.getOperand(2);
2036 } else
2037 OpToFoldPtr = &MI.getOperand(1);
2038 MachineOperand &OpToFold = *OpToFoldPtr;
2039 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2040
2041 // FIXME: We could also be folding things like TargetIndexes.
2042 if (!FoldingImm && !OpToFold.isReg())
2043 return false;
2044
2045 // Fold virtual registers and constant physical registers.
2046 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2047 !TRI->isConstantPhysReg(OpToFold.getReg()))
2048 return false;
2049
2050 // Prevent folding operands backwards in the function. For example,
2051 // the COPY opcode must not be replaced by 1 in this example:
2052 //
2053 // %3 = COPY %vgpr0; VGPR_32:%3
2054 // ...
2055 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2056 if (!DstReg.isVirtual())
2057 return false;
2058
2059 const TargetRegisterClass *DstRC =
2060 MRI->getRegClass(MI.getOperand(0).getReg());
2061
2062 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2063 // Can remove this code if proper 16-bit SGPRs are implemented
2064 // Example: Pre-peephole-opt
2065 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2066 // %32:sreg_32 = COPY %29:sgpr_lo16
2067 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2068 // Post-peephole-opt and DCE
2069 // %32:sreg_32 = COPY %16.lo16:sreg_32
2070 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2071 // After this transform
2072 // %32:sreg_32 = COPY %16:sreg_32
2073 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2074 // After the fold operands pass
2075 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2076 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2077 OpToFold.getSubReg()) {
2078 if (DstRC == &AMDGPU::SReg_32RegClass &&
2079 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2080 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2081 OpToFold.setSubReg(0);
2082 }
2083 }
2084
2085 // Fold copy to AGPR through reg_sequence
2086 // TODO: Handle with subregister extract
2087 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2088 if (foldCopyToAGPRRegSequence(&MI))
2089 return true;
2090 }
2091
2092 FoldableDef Def(OpToFold, DstRC);
2093 bool Changed = foldInstOperand(MI, Def);
2094
2095 // If we managed to fold all uses of this copy then we might as well
2096 // delete it now.
2097 // The only reason we need to follow chains of copies here is that
2098 // tryFoldRegSequence looks forward through copies before folding a
2099 // REG_SEQUENCE into its eventual users.
2100 auto *InstToErase = &MI;
2101 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2102 auto &SrcOp = InstToErase->getOperand(1);
2103 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2104 InstToErase->eraseFromParent();
2105 Changed = true;
2106 InstToErase = nullptr;
2107 if (!SrcReg || SrcReg.isPhysical())
2108 break;
2109 InstToErase = MRI->getVRegDef(SrcReg);
2110 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2111 break;
2112 }
2113
2114 if (InstToErase && InstToErase->isRegSequence() &&
2115 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2116 InstToErase->eraseFromParent();
2117 Changed = true;
2118 }
2119
2120 if (Changed)
2121 return true;
2122
2123 // Run this after foldInstOperand to avoid turning scalar additions into
2124 // vector additions when the result scalar result could just be folded into
2125 // the user(s).
2126 return OpToFold.isReg() &&
2127 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2128}
2129
2130// Clamp patterns are canonically selected to v_max_* instructions, so only
2131// handle them.
2132const MachineOperand *
2133SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2134 unsigned Op = MI.getOpcode();
2135 switch (Op) {
2136 case AMDGPU::V_MAX_F32_e64:
2137 case AMDGPU::V_MAX_F16_e64:
2138 case AMDGPU::V_MAX_F16_t16_e64:
2139 case AMDGPU::V_MAX_F16_fake16_e64:
2140 case AMDGPU::V_MAX_F64_e64:
2141 case AMDGPU::V_MAX_NUM_F64_e64:
2142 case AMDGPU::V_PK_MAX_F16:
2143 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2144 case AMDGPU::V_PK_MAX_NUM_BF16: {
2145 if (MI.mayRaiseFPException())
2146 return nullptr;
2147
2148 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2149 return nullptr;
2150
2151 // Make sure sources are identical.
2152 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2153 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2154 if (!Src0->isReg() || !Src1->isReg() ||
2155 Src0->getReg() != Src1->getReg() ||
2156 Src0->getSubReg() != Src1->getSubReg() ||
2157 Src0->getSubReg() != AMDGPU::NoSubRegister)
2158 return nullptr;
2159
2160 // Can't fold up if we have modifiers.
2161 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2162 return nullptr;
2163
2164 unsigned Src0Mods
2165 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2166 unsigned Src1Mods
2167 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2168
2169 // Having a 0 op_sel_hi would require swizzling the output in the source
2170 // instruction, which we can't do.
2171 unsigned UnsetMods =
2172 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2174 : 0u;
2175 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2176 return nullptr;
2177 return Src0;
2178 }
2179 default:
2180 return nullptr;
2181 }
2182}
2183
2184// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2185bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2186 const MachineOperand *ClampSrc = isClamp(MI);
2187 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2188 return false;
2189
2190 if (!ClampSrc->getReg().isVirtual())
2191 return false;
2192
2193 // Look through COPY. COPY only observed with True16.
2194 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2195 MachineInstr *Def =
2196 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2197
2198 // The type of clamp must be compatible.
2199 if (!SIInstrInfo::hasSameClamp(*Def, MI))
2200 return false;
2201
2202 if (Def->mayRaiseFPException())
2203 return false;
2204
2205 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2206 if (!DefClamp)
2207 return false;
2208
2209 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2210
2211 // Clamp is applied after omod, so it is OK if omod is set.
2212 DefClamp->setImm(1);
2213
2214 Register DefReg = Def->getOperand(0).getReg();
2215 Register MIDstReg = MI.getOperand(0).getReg();
2216 if (TRI->isSGPRReg(*MRI, DefReg)) {
2217 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
2218 // instruction with a VGPR dst.
2219 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2220 MIDstReg)
2221 .addReg(DefReg);
2222 } else {
2223 MRI->replaceRegWith(MIDstReg, DefReg);
2224 }
2225 MI.eraseFromParent();
2226
2227 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2228 // instruction, so we might as well convert it to the more flexible VOP3-only
2229 // mad/fma form.
2230 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2231 Def->eraseFromParent();
2232
2233 return true;
2234}
2235
2236static int getOModValue(unsigned Opc, int64_t Val) {
2237 switch (Opc) {
2238 case AMDGPU::V_MUL_F64_e64:
2239 case AMDGPU::V_MUL_F64_pseudo_e64: {
2240 switch (Val) {
2241 case 0x3fe0000000000000: // 0.5
2242 return SIOutMods::DIV2;
2243 case 0x4000000000000000: // 2.0
2244 return SIOutMods::MUL2;
2245 case 0x4010000000000000: // 4.0
2246 return SIOutMods::MUL4;
2247 default:
2248 return SIOutMods::NONE;
2249 }
2250 }
2251 case AMDGPU::V_MUL_F32_e64: {
2252 switch (static_cast<uint32_t>(Val)) {
2253 case 0x3f000000: // 0.5
2254 return SIOutMods::DIV2;
2255 case 0x40000000: // 2.0
2256 return SIOutMods::MUL2;
2257 case 0x40800000: // 4.0
2258 return SIOutMods::MUL4;
2259 default:
2260 return SIOutMods::NONE;
2261 }
2262 }
2263 case AMDGPU::V_MUL_F16_e64:
2264 case AMDGPU::V_MUL_F16_t16_e64:
2265 case AMDGPU::V_MUL_F16_fake16_e64: {
2266 switch (static_cast<uint16_t>(Val)) {
2267 case 0x3800: // 0.5
2268 return SIOutMods::DIV2;
2269 case 0x4000: // 2.0
2270 return SIOutMods::MUL2;
2271 case 0x4400: // 4.0
2272 return SIOutMods::MUL4;
2273 default:
2274 return SIOutMods::NONE;
2275 }
2276 }
2277 default:
2278 llvm_unreachable("invalid mul opcode");
2279 }
2280}
2281
2282// FIXME: Does this really not support denormals with f16?
2283// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2284// handled, so will anything other than that break?
2285std::pair<const MachineOperand *, int>
2286SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2287 unsigned Op = MI.getOpcode();
2288 switch (Op) {
2289 case AMDGPU::V_MUL_F64_e64:
2290 case AMDGPU::V_MUL_F64_pseudo_e64:
2291 case AMDGPU::V_MUL_F32_e64:
2292 case AMDGPU::V_MUL_F16_t16_e64:
2293 case AMDGPU::V_MUL_F16_fake16_e64:
2294 case AMDGPU::V_MUL_F16_e64: {
2295 // If output denormals are enabled, omod is ignored.
2296 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2298 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2299 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2300 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2303 MI.mayRaiseFPException())
2304 return std::pair(nullptr, SIOutMods::NONE);
2305
2306 const MachineOperand *RegOp = nullptr;
2307 const MachineOperand *ImmOp = nullptr;
2308 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2309 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2310 if (Src0->isImm()) {
2311 ImmOp = Src0;
2312 RegOp = Src1;
2313 } else if (Src1->isImm()) {
2314 ImmOp = Src1;
2315 RegOp = Src0;
2316 } else
2317 return std::pair(nullptr, SIOutMods::NONE);
2318
2319 int OMod = getOModValue(Op, ImmOp->getImm());
2320 if (OMod == SIOutMods::NONE ||
2321 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2322 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2323 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2324 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2325 return std::pair(nullptr, SIOutMods::NONE);
2326
2327 return std::pair(RegOp, OMod);
2328 }
2329 case AMDGPU::V_ADD_F64_e64:
2330 case AMDGPU::V_ADD_F64_pseudo_e64:
2331 case AMDGPU::V_ADD_F32_e64:
2332 case AMDGPU::V_ADD_F16_e64:
2333 case AMDGPU::V_ADD_F16_t16_e64:
2334 case AMDGPU::V_ADD_F16_fake16_e64: {
2335 // If output denormals are enabled, omod is ignored.
2336 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2338 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2339 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2340 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2342 return std::pair(nullptr, SIOutMods::NONE);
2343
2344 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2345 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2346 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2347
2348 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2349 Src0->getSubReg() == Src1->getSubReg() &&
2350 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2351 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2352 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2353 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2354 return std::pair(Src0, SIOutMods::MUL2);
2355
2356 return std::pair(nullptr, SIOutMods::NONE);
2357 }
2358 default:
2359 return std::pair(nullptr, SIOutMods::NONE);
2360 }
2361}
2362
2363// FIXME: Does this need to check IEEE bit on function?
2364bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2365 const MachineOperand *RegOp;
2366 int OMod;
2367 std::tie(RegOp, OMod) = isOMod(MI);
2368 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2369 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2370 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2371 return false;
2372
2373 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2374 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2375 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2376 return false;
2377
2378 if (Def->mayRaiseFPException())
2379 return false;
2380
2381 // Clamp is applied after omod. If the source already has clamp set, don't
2382 // fold it.
2383 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2384 return false;
2385
2386 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2387
2388 DefOMod->setImm(OMod);
2389 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2390 // Kill flags can be wrong if we replaced a def inside a loop with a def
2391 // outside the loop.
2392 MRI->clearKillFlags(Def->getOperand(0).getReg());
2393 MI.eraseFromParent();
2394
2395 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2396 // instruction, so we might as well convert it to the more flexible VOP3-only
2397 // mad/fma form.
2398 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2399 Def->eraseFromParent();
2400
2401 return true;
2402}
2403
2404// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2405// instruction which can take an agpr. So far that means a store.
2406bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2407 assert(MI.isRegSequence());
2408 auto Reg = MI.getOperand(0).getReg();
2409
2410 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2411 !MRI->hasOneNonDBGUse(Reg))
2412 return false;
2413
2415 if (!getRegSeqInit(Defs, Reg))
2416 return false;
2417
2418 for (auto &[Op, SubIdx] : Defs) {
2419 if (!Op->isReg())
2420 return false;
2421 if (TRI->isAGPR(*MRI, Op->getReg()))
2422 continue;
2423 // Maybe this is a COPY from AREG
2424 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2425 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2426 return false;
2427 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2428 return false;
2429 }
2430
2431 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2432 MachineInstr *UseMI = Op->getParent();
2433 while (UseMI->isCopy() && !Op->getSubReg()) {
2434 Reg = UseMI->getOperand(0).getReg();
2435 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2436 return false;
2437 Op = &*MRI->use_nodbg_begin(Reg);
2438 UseMI = Op->getParent();
2439 }
2440
2441 if (Op->getSubReg())
2442 return false;
2443
2444 unsigned OpIdx = Op - &UseMI->getOperand(0);
2445 const MCInstrDesc &InstDesc = UseMI->getDesc();
2446 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
2447 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2448 return false;
2449
2450 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2451 auto Dst = MRI->createVirtualRegister(NewDstRC);
2452 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2453 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2454
2455 for (auto &[Def, SubIdx] : Defs) {
2456 Def->setIsKill(false);
2457 if (TRI->isAGPR(*MRI, Def->getReg())) {
2458 RS.add(*Def);
2459 } else { // This is a copy
2460 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2461 SubDef->getOperand(1).setIsKill(false);
2462 RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg());
2463 }
2464 RS.addImm(SubIdx);
2465 }
2466
2467 Op->setReg(Dst);
2468 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2469 Op->setReg(Reg);
2470 RS->eraseFromParent();
2471 return false;
2472 }
2473
2474 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2475
2476 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2477 // in which case we can erase them all later in runOnMachineFunction.
2478 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2479 MI.eraseFromParent();
2480 return true;
2481}
2482
2483/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
2484/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2485static bool isAGPRCopy(const SIRegisterInfo &TRI,
2486 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2487 Register &OutReg, unsigned &OutSubReg) {
2488 assert(Copy.isCopy());
2489
2490 const MachineOperand &CopySrc = Copy.getOperand(1);
2491 Register CopySrcReg = CopySrc.getReg();
2492 if (!CopySrcReg.isVirtual())
2493 return false;
2494
2495 // Common case: copy from AGPR directly, e.g.
2496 // %1:vgpr_32 = COPY %0:agpr_32
2497 if (TRI.isAGPR(MRI, CopySrcReg)) {
2498 OutReg = CopySrcReg;
2499 OutSubReg = CopySrc.getSubReg();
2500 return true;
2501 }
2502
2503 // Sometimes it can also involve two copies, e.g.
2504 // %1:vgpr_256 = COPY %0:agpr_256
2505 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2506 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2507 if (!CopySrcDef || !CopySrcDef->isCopy())
2508 return false;
2509
2510 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2511 Register OtherCopySrcReg = OtherCopySrc.getReg();
2512 if (!OtherCopySrcReg.isVirtual() ||
2513 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2514 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2515 !TRI.isAGPR(MRI, OtherCopySrcReg))
2516 return false;
2517
2518 OutReg = OtherCopySrcReg;
2519 OutSubReg = CopySrc.getSubReg();
2520 return true;
2521}
2522
2523// Try to hoist an AGPR to VGPR copy across a PHI.
2524// This should allow folding of an AGPR into a consumer which may support it.
2525//
2526// Example 1: LCSSA PHI
2527// loop:
2528// %1:vreg = COPY %0:areg
2529// exit:
2530// %2:vreg = PHI %1:vreg, %loop
2531// =>
2532// loop:
2533// exit:
2534// %1:areg = PHI %0:areg, %loop
2535// %2:vreg = COPY %1:areg
2536//
2537// Example 2: PHI with multiple incoming values:
2538// entry:
2539// %1:vreg = GLOBAL_LOAD(..)
2540// loop:
2541// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2542// %3:areg = COPY %2:vreg
2543// %4:areg = (instr using %3:areg)
2544// %5:vreg = COPY %4:areg
2545// =>
2546// entry:
2547// %1:vreg = GLOBAL_LOAD(..)
2548// %2:areg = COPY %1:vreg
2549// loop:
2550// %3:areg = PHI %2:areg, %entry, %X:areg,
2551// %4:areg = (instr using %3:areg)
2552bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2553 assert(PHI.isPHI());
2554
2555 Register PhiOut = PHI.getOperand(0).getReg();
2556 if (!TRI->isVGPR(*MRI, PhiOut))
2557 return false;
2558
2559 // Iterate once over all incoming values of the PHI to check if this PHI is
2560 // eligible, and determine the exact AGPR RC we'll target.
2561 const TargetRegisterClass *ARC = nullptr;
2562 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2563 MachineOperand &MO = PHI.getOperand(K);
2564 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2565 if (!Copy || !Copy->isCopy())
2566 continue;
2567
2568 Register AGPRSrc;
2569 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2570 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2571 continue;
2572
2573 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2574 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2575 CopyInRC = SubRC;
2576
2577 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2578 return false;
2579 ARC = CopyInRC;
2580 }
2581
2582 if (!ARC)
2583 return false;
2584
2585 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2586
2587 // Rewrite the PHI's incoming values to ARC.
2588 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2589 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2590 MachineOperand &MO = PHI.getOperand(K);
2591 Register Reg = MO.getReg();
2592
2594 MachineBasicBlock *InsertMBB = nullptr;
2595
2596 // Look at the def of Reg, ignoring all copies.
2597 unsigned CopyOpc = AMDGPU::COPY;
2598 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2599
2600 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2601 // the copy was single-use, it will be removed by DCE later.
2602 if (Def->isCopy()) {
2603 Register AGPRSrc;
2604 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2605 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2606 MO.setReg(AGPRSrc);
2607 MO.setSubReg(AGPRSubReg);
2608 continue;
2609 }
2610
2611 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2612 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2613 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2614 // is unlikely to be profitable.
2615 //
2616 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2617 MachineOperand &CopyIn = Def->getOperand(1);
2618 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2619 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2620 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2621 }
2622
2623 InsertMBB = Def->getParent();
2624 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2625 } else {
2626 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2627 InsertPt = InsertMBB->getFirstTerminator();
2628 }
2629
2630 Register NewReg = MRI->createVirtualRegister(ARC);
2631 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2632 TII->get(CopyOpc), NewReg)
2633 .addReg(Reg);
2634 MO.setReg(NewReg);
2635
2636 (void)MI;
2637 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2638 }
2639
2640 // Replace the PHI's result with a new register.
2641 Register NewReg = MRI->createVirtualRegister(ARC);
2642 PHI.getOperand(0).setReg(NewReg);
2643
2644 // COPY that new register back to the original PhiOut register. This COPY will
2645 // usually be folded out later.
2646 MachineBasicBlock *MBB = PHI.getParent();
2647 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2648 TII->get(AMDGPU::COPY), PhiOut)
2649 .addReg(NewReg);
2650
2651 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2652 return true;
2653}
2654
2655// Attempt to convert VGPR load to an AGPR load.
2656bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2657 assert(MI.mayLoad());
2658 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2659 return false;
2660
2661 MachineOperand &Def = MI.getOperand(0);
2662 if (!Def.isDef())
2663 return false;
2664
2665 Register DefReg = Def.getReg();
2666
2667 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2668 return false;
2669
2672 SmallVector<Register, 8> MoveRegs;
2673
2674 if (Users.empty())
2675 return false;
2676
2677 // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
2678 while (!Users.empty()) {
2679 const MachineInstr *I = Users.pop_back_val();
2680 if (!I->isCopy() && !I->isRegSequence())
2681 return false;
2682 Register DstReg = I->getOperand(0).getReg();
2683 // Physical registers may have more than one instruction definitions
2684 if (DstReg.isPhysical())
2685 return false;
2686 if (TRI->isAGPR(*MRI, DstReg))
2687 continue;
2688 MoveRegs.push_back(DstReg);
2689 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2690 Users.push_back(&U);
2691 }
2692
2693 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2694 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2695 if (!TII->isOperandLegal(MI, 0, &Def)) {
2696 MRI->setRegClass(DefReg, RC);
2697 return false;
2698 }
2699
2700 while (!MoveRegs.empty()) {
2701 Register Reg = MoveRegs.pop_back_val();
2702 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2703 }
2704
2705 LLVM_DEBUG(dbgs() << "Folded " << MI);
2706
2707 return true;
2708}
2709
2710// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2711// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2712// there's cases where it can create a lot more AGPR-AGPR copies, which are
2713// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2714//
2715// This function looks at all AGPR PHIs in a basic block and collects their
2716// operands. Then, it checks for register that are used more than once across
2717// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2718// having to create one VGPR temporary per use, which can get very messy if
2719// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2720// element).
2721//
2722// Example
2723// a:
2724// %in:agpr_256 = COPY %foo:vgpr_256
2725// c:
2726// %x:agpr_32 = ..
2727// b:
2728// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2729// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2730// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2731// =>
2732// a:
2733// %in:agpr_256 = COPY %foo:vgpr_256
2734// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2735// %tmp_agpr:agpr_32 = COPY %tmp
2736// c:
2737// %x:agpr_32 = ..
2738// b:
2739// %0:areg = PHI %tmp_agpr, %a, %x, %c
2740// %1:areg = PHI %tmp_agpr, %a, %y, %c
2741// %2:areg = PHI %tmp_agpr, %a, %z, %c
2742bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2743 // This is only really needed on GFX908 where AGPR-AGPR copies are
2744 // unreasonably difficult.
2745 if (ST->hasGFX90AInsts())
2746 return false;
2747
2748 // Look at all AGPR Phis and collect the register + subregister used.
2749 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2750 RegToMO;
2751
2752 for (auto &MI : MBB) {
2753 if (!MI.isPHI())
2754 break;
2755
2756 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2757 continue;
2758
2759 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2760 MachineOperand &PhiMO = MI.getOperand(K);
2761 if (!PhiMO.getSubReg())
2762 continue;
2763 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2764 }
2765 }
2766
2767 // For all (Reg, SubReg) pair that are used more than once, cache the value in
2768 // a VGPR.
2769 bool Changed = false;
2770 for (const auto &[Entry, MOs] : RegToMO) {
2771 if (MOs.size() == 1)
2772 continue;
2773
2774 const auto [Reg, SubReg] = Entry;
2775 MachineInstr *Def = MRI->getVRegDef(Reg);
2776 MachineBasicBlock *DefMBB = Def->getParent();
2777
2778 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2779 // out.
2780 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2781 Register TempVGPR =
2782 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2783 MachineInstr *VGPRCopy =
2784 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2785 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2786 .addReg(Reg, /* flags */ {}, SubReg);
2787
2788 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2789 Register TempAGPR = MRI->createVirtualRegister(ARC);
2790 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2791 TII->get(AMDGPU::COPY), TempAGPR)
2792 .addReg(TempVGPR);
2793
2794 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2795 for (MachineOperand *MO : MOs) {
2796 MO->setReg(TempAGPR);
2797 MO->setSubReg(AMDGPU::NoSubRegister);
2798 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2799 }
2800
2801 Changed = true;
2802 }
2803
2804 return Changed;
2805}
2806
2807bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2808 this->MF = &MF;
2809 MRI = &MF.getRegInfo();
2810 ST = &MF.getSubtarget<GCNSubtarget>();
2811 TII = ST->getInstrInfo();
2812 TRI = &TII->getRegisterInfo();
2813 MFI = MF.getInfo<SIMachineFunctionInfo>();
2814
2815 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2816 // correctly handle signed zeros.
2817 //
2818 // FIXME: Also need to check strictfp
2819 bool IsIEEEMode = MFI->getMode().IEEE;
2820
2821 bool Changed = false;
2822 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2823 MachineOperand *CurrentKnownM0Val = nullptr;
2824 for (auto &MI : make_early_inc_range(*MBB)) {
2825 Changed |= tryFoldCndMask(MI);
2826
2827 if (tryFoldZeroHighBits(MI)) {
2828 Changed = true;
2829 continue;
2830 }
2831
2832 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2833 Changed = true;
2834 continue;
2835 }
2836
2837 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2838 Changed = true;
2839 continue;
2840 }
2841
2842 if (MI.mayLoad() && tryFoldLoad(MI)) {
2843 Changed = true;
2844 continue;
2845 }
2846
2847 if (TII->isFoldableCopy(MI)) {
2848 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2849 continue;
2850 }
2851
2852 // Saw an unknown clobber of m0, so we no longer know what it is.
2853 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2854 CurrentKnownM0Val = nullptr;
2855
2856 // TODO: Omod might be OK if there is NSZ only on the source
2857 // instruction, and not the omod multiply.
2858 if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI))
2859 Changed |= tryFoldClamp(MI);
2860 }
2861
2862 Changed |= tryOptimizeAGPRPhis(*MBB);
2863 }
2864
2865 return Changed;
2866}
2867
2870 MFPropsModifier _(*this, MF);
2871
2872 bool Changed = SIFoldOperandsImpl().run(MF);
2873 if (!Changed) {
2874 return PreservedAnalyses::all();
2875 }
2877 PA.preserveSet<CFGAnalyses>();
2878 return PA;
2879}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
#define _
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is an AGPR -> VGPR copy.
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, FoldCandidate &&Entry)
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static unsigned getMovOpc(bool IsScalar)
static MachineOperand * lookUpCopyChain(const SIInstrInfo &TII, const MachineRegisterInfo &MRI, Register SrcReg)
static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(const FoldableDef &OpToFold)
static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo)
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static int Lookup(ArrayRef< TableEntry > Table, unsigned Opcode)
Value * RHS
Value * LHS
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool hasDOTOpSelHazard() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
const HexagonRegisterInfo & getRegisterInfo() const
ArrayRef< MCOperandInfo > operands() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void clearFlag(MIFlag Flag)
clearFlag - Clear a MI flag.
bool isRegSequence() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
LLVM_ABI void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
LLVM_ABI void substPhysReg(MCRegister Reg, const TargetRegisterInfo &)
substPhysReg - Substitute the current register with the physical register Reg, taking any existing Su...
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
use_nodbg_iterator use_nodbg_begin(Register RegNo) const
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool hasOneNonDBGUser(Register RegNo) const
hasOneNonDBGUser - Return true if there is exactly one non-Debug instruction using the specified regis...
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool hasSameClamp(const MachineInstr &A, const MachineInstr &B)
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
IteratorT begin() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:229
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:231
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:215
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:225
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:232
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:243
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:244
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:230
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:245
LLVM_READONLY int32_t getFlatScratchInstSSfromSV(uint32_t Opcode)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineBasicBlock::instr_iterator getBundleStart(MachineBasicBlock::instr_iterator I)
Returns an iterator to the first instruction in the bundle containing I.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createSIFoldOperandsLegacyPass()
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
char & SIFoldOperandsLegacyID
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.