SIFoldOperands.cpp
1 //===-- SIFoldOperands.cpp - Fold operands -------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 /// \file
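/// Brief description added here for orientation; it only restates what the
/// code below does. The pass folds the source operands of foldable copies
/// (immediates, frame indexes, global addresses, and registers) directly into
/// their uses, constant-folds trivially evaluable instructions along the way,
/// and also folds clamp and output-modifier (omod) patterns into the
/// instructions that define their inputs.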
8 //===----------------------------------------------------------------------===//
9 //
10 
11 #include "AMDGPU.h"
12 #include "AMDGPUSubtarget.h"
13 #include "SIInstrInfo.h"
14 #include "SIMachineFunctionInfo.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/ADT/DepthFirstIterator.h"
17 #include "llvm/CodeGen/LiveIntervals.h"
18 #include "llvm/CodeGen/MachineFunctionPass.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/MachineRegisterInfo.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/raw_ostream.h"
23 #include "llvm/Target/TargetMachine.h"
24 
25 #define DEBUG_TYPE "si-fold-operands"
26 using namespace llvm;
27 
28 namespace {
29 
30 struct FoldCandidate {
31  MachineInstr *UseMI;
32  union {
33  MachineOperand *OpToFold;
34  uint64_t ImmToFold;
35  int FrameIndexToFold;
36  };
37  int ShrinkOpcode;
38  unsigned char UseOpNo;
39  MachineOperand::MachineOperandType Kind;
40  bool Commuted;
41 
42  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
43  bool Commuted_ = false,
44  int ShrinkOp = -1) :
45  UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
46  Kind(FoldOp->getType()),
47  Commuted(Commuted_) {
48  if (FoldOp->isImm()) {
49  ImmToFold = FoldOp->getImm();
50  } else if (FoldOp->isFI()) {
51  FrameIndexToFold = FoldOp->getIndex();
52  } else {
53  assert(FoldOp->isReg() || FoldOp->isGlobal());
54  OpToFold = FoldOp;
55  }
56  }
57 
58  bool isFI() const {
59  return Kind == MachineOperand::MO_FrameIndex;
60  }
61 
62  bool isImm() const {
63  return Kind == MachineOperand::MO_Immediate;
64  }
65 
66  bool isReg() const {
67  return Kind == MachineOperand::MO_Register;
68  }
69 
70  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
71 
72  bool isCommuted() const {
73  return Commuted;
74  }
75 
76  bool needsShrink() const {
77  return ShrinkOpcode != -1;
78  }
79 
80  int getShrinkOpcode() const {
81  return ShrinkOpcode;
82  }
83 };
84 
85 class SIFoldOperands : public MachineFunctionPass {
86 public:
87  static char ID;
88  MachineRegisterInfo *MRI;
89  const SIInstrInfo *TII;
90  const SIRegisterInfo *TRI;
91  const GCNSubtarget *ST;
92  const SIMachineFunctionInfo *MFI;
93 
94  void foldOperand(MachineOperand &OpToFold,
95  MachineInstr *UseMI,
96  int UseOpIdx,
97  SmallVectorImpl<FoldCandidate> &FoldList,
98  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
99 
100  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
101 
102  const MachineOperand *isClamp(const MachineInstr &MI) const;
103  bool tryFoldClamp(MachineInstr &MI);
104 
105  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
106  bool tryFoldOMod(MachineInstr &MI);
107 
108 public:
109  SIFoldOperands() : MachineFunctionPass(ID) {
110  initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
111  }
112 
113  bool runOnMachineFunction(MachineFunction &MF) override;
114 
115  StringRef getPassName() const override { return "SI Fold Operands"; }
116 
117  void getAnalysisUsage(AnalysisUsage &AU) const override {
118  AU.setPreservesCFG();
119  MachineFunctionPass::getAnalysisUsage(AU);
120  }
121 };
122 
123 } // End anonymous namespace.
124 
125 INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
126  "SI Fold Operands", false, false)
127 
128 char SIFoldOperands::ID = 0;
129 
130 char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
131 
132 // Wrapper around isInlineConstant that understands special cases when
133 // instruction types are replaced during operand folding.
134 static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
135  const MachineInstr &UseMI,
136  unsigned OpNo,
137  const MachineOperand &OpToFold) {
138  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
139  return true;
140 
141  unsigned Opc = UseMI.getOpcode();
142  switch (Opc) {
143  case AMDGPU::V_MAC_F32_e64:
144  case AMDGPU::V_MAC_F16_e64:
145  case AMDGPU::V_FMAC_F32_e64: {
146  // Special case for mac. Since this is replaced with mad when folded into
147  // src2, we need to check the legality for the final instruction.
148  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
149  if (static_cast<int>(OpNo) == Src2Idx) {
150  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
151  bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
152 
153  unsigned Opc = IsFMA ?
154  AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
155  const MCInstrDesc &MadDesc = TII->get(Opc);
156  return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
157  }
158  return false;
159  }
160  default:
161  return false;
162  }
163 }
164 
165 // TODO: Add heuristic that the frame index might not fit in the addressing mode
166 // immediate offset to avoid materializing in loops.
167 static bool frameIndexMayFold(const SIInstrInfo *TII,
168  const MachineInstr &UseMI,
169  int OpNo,
170  const MachineOperand &OpToFold) {
171  return OpToFold.isFI() &&
172  (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
173  OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
174 }
175 
176 FunctionPass *llvm::createSIFoldOperandsPass() {
177  return new SIFoldOperands();
178 }
179 
180 static bool updateOperand(FoldCandidate &Fold,
181  const SIInstrInfo &TII,
182  const TargetRegisterInfo &TRI,
183  const GCNSubtarget &ST) {
184  MachineInstr *MI = Fold.UseMI;
185  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
186  assert(Old.isReg());
187 
188  if (Fold.isImm()) {
189  if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
190  !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
191  AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
192  ST.hasInv2PiInlineImm())) {
193  // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
194  // already set.
195  unsigned Opcode = MI->getOpcode();
196  int OpNo = MI->getOperandNo(&Old);
197  int ModIdx = -1;
198  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
199  ModIdx = AMDGPU::OpName::src0_modifiers;
200  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
201  ModIdx = AMDGPU::OpName::src1_modifiers;
202  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
203  ModIdx = AMDGPU::OpName::src2_modifiers;
204  assert(ModIdx != -1);
205  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
206  MachineOperand &Mod = MI->getOperand(ModIdx);
207  unsigned Val = Mod.getImm();
208  if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
209  return false;
210  // Only apply the following transformation if that operand requires
211  // a packed immediate.
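  // Illustrative note (derived from the code below): when folding the literal
  // 0x12340000, the low half is zero, so the operand becomes 0x1234 with
  // OP_SEL_0 set and OP_SEL_1 cleared.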
212  switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
213  case AMDGPU::OPERAND_REG_IMM_V2FP16:
214  case AMDGPU::OPERAND_REG_IMM_V2INT16:
215  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
216  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
217  // If upper part is all zero we do not need op_sel_hi.
218  if (!isUInt<16>(Fold.ImmToFold)) {
219  if (!(Fold.ImmToFold & 0xffff)) {
220  Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
221  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
222  Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
223  return true;
224  }
225  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
226  Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
227  return true;
228  }
229  break;
230  default:
231  break;
232  }
233  }
234  }
235 
236  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
237  MachineBasicBlock *MBB = MI->getParent();
238  auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
239  if (Liveness != MachineBasicBlock::LQR_Dead)
240  return false;
241 
242  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
243  int Op32 = Fold.getShrinkOpcode();
244  MachineOperand &Dst0 = MI->getOperand(0);
245  MachineOperand &Dst1 = MI->getOperand(1);
246  assert(Dst0.isDef() && Dst1.isDef());
247 
248  bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
249 
250  const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
251  unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
252 
253  MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
254 
255  if (HaveNonDbgCarryUse) {
256  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
257  .addReg(AMDGPU::VCC, RegState::Kill);
258  }
259 
260  // Keep the old instruction around to avoid breaking iterators, but
261  // replace it with a dummy instruction to remove uses.
262  //
263  // FIXME: We should not invert how this pass looks at operands to avoid
264  // this. Should track set of foldable movs instead of looking for uses
265  // when looking at a use.
266  Dst0.setReg(NewReg0);
267  for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
268  MI->RemoveOperand(I);
269  MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
270 
271  if (Fold.isCommuted())
272  TII.commuteInstruction(*Inst32, false);
273  return true;
274  }
275 
276  assert(!Fold.needsShrink() && "not handled");
277 
278  if (Fold.isImm()) {
279  Old.ChangeToImmediate(Fold.ImmToFold);
280  return true;
281  }
282 
283  if (Fold.isGlobal()) {
284  Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
285  Fold.OpToFold->getTargetFlags());
286  return true;
287  }
288 
289  if (Fold.isFI()) {
290  Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
291  return true;
292  }
293 
294  MachineOperand *New = Fold.OpToFold;
295  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
296  Old.setIsUndef(New->isUndef());
297  return true;
298 }
299 
300 static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
301  const MachineInstr *MI) {
302  for (auto Candidate : FoldList) {
303  if (Candidate.UseMI == MI)
304  return true;
305  }
306  return false;
307 }
308 
309 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
310  MachineInstr *MI, unsigned OpNo,
311  MachineOperand *OpToFold,
312  const SIInstrInfo *TII) {
313  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
314  // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
315  unsigned Opc = MI->getOpcode();
316  if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
317  Opc == AMDGPU::V_FMAC_F32_e64) &&
318  (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
319  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
320  bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
321  unsigned NewOpc = IsFMA ?
322  AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
323 
324  // Check if changing this to a v_mad_{f16, f32} instruction will allow us
325  // to fold the operand.
326  MI->setDesc(TII->get(NewOpc));
327  bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
328  if (FoldAsMAD) {
329  MI->untieRegOperand(OpNo);
330  return true;
331  }
332  MI->setDesc(TII->get(Opc));
333  }
334 
335  // Special case for s_setreg_b32
336  if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
337  MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
338  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
339  return true;
340  }
341 
342  // If we are already folding into another operand of MI, then
343  // we can't commute the instruction, otherwise we risk making the
344  // other fold illegal.
345  if (isUseMIInFoldList(FoldList, MI))
346  return false;
347 
348  unsigned CommuteOpNo = OpNo;
349 
350  // Operand is not legal, so try to commute the instruction to
351  // see if this makes it possible to fold.
352  unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
353  unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
354  bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
355 
356  if (CanCommute) {
357  if (CommuteIdx0 == OpNo)
358  CommuteOpNo = CommuteIdx1;
359  else if (CommuteIdx1 == OpNo)
360  CommuteOpNo = CommuteIdx0;
361  }
362 
363 
364  // One of operands might be an Imm operand, and OpNo may refer to it after
365  // the call of commuteInstruction() below. Such situations are avoided
366  // here explicitly as OpNo must be a register operand to be a candidate
367  // for memory folding.
368  if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
369  !MI->getOperand(CommuteIdx1).isReg()))
370  return false;
371 
372  if (!CanCommute ||
373  !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
374  return false;
375 
376  if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
377  if ((Opc == AMDGPU::V_ADD_I32_e64 ||
378  Opc == AMDGPU::V_SUB_I32_e64 ||
379  Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
380  (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
381  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
382 
383  // Verify the other operand is a VGPR, otherwise we would violate the
384  // constant bus restriction.
385  unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
386  MachineOperand &OtherOp = MI->getOperand(OtherIdx);
387  if (!OtherOp.isReg() ||
388  !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
389  return false;
390 
391  assert(MI->getOperand(1).isDef());
392 
393  // Make sure to get the 32-bit version of the commuted opcode.
394  unsigned MaybeCommutedOpc = MI->getOpcode();
395  int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
396 
397  FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
398  Op32));
399  return true;
400  }
401 
402  TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
403  return false;
404  }
405 
406  FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
407  return true;
408  }
409 
410  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
411  return true;
412 }
413 
414 // If the use operand doesn't care about the value, this may be an operand only
415 // used for register indexing, in which case it is unsafe to fold.
416 static bool isUseSafeToFold(const SIInstrInfo *TII,
417  const MachineInstr &MI,
418  const MachineOperand &UseMO) {
419  return !UseMO.isUndef() && !TII->isSDWA(MI);
420  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
421 }
422 
423 static bool tryToFoldACImm(const SIInstrInfo *TII,
424  const MachineOperand &OpToFold,
425  MachineInstr *UseMI,
426  unsigned UseOpIdx,
427  SmallVectorImpl<FoldCandidate> &FoldList) {
428  const MCInstrDesc &Desc = UseMI->getDesc();
429  const MCOperandInfo *OpInfo = Desc.OpInfo;
430  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
431  return false;
432 
433  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
434  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
435  OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
436  return false;
437 
438  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
439  UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
440  return true;
441  }
442 
443  if (!OpToFold.isReg())
444  return false;
445 
446  unsigned UseReg = OpToFold.getReg();
447  if (!TargetRegisterInfo::isVirtualRegister(UseReg))
448  return false;
449 
450  if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
451  return FC.UseMI == UseMI; }) != FoldList.end())
452  return false;
453 
454  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
455  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
456  if (!Def || !Def->isRegSequence())
457  return false;
458 
459  int64_t Imm;
460  MachineOperand *Op = nullptr;
461  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
462  const MachineOperand &Sub = Def->getOperand(I);
463  if (!Sub.isReg() || Sub.getSubReg())
464  return false;
465  MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
466  while (SubDef && !SubDef->isMoveImmediate() &&
467  !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
468  SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
469  if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
470  return false;
471  Op = &SubDef->getOperand(1);
472  auto SubImm = Op->getImm();
473  if (I == 1) {
474  if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
475  return false;
476 
477  Imm = SubImm;
478  continue;
479  }
480  if (Imm != SubImm)
481  return false; // Can only fold splat constants
482  }
483 
484  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
485  return true;
486 }
487 
488 void SIFoldOperands::foldOperand(
489  MachineOperand &OpToFold,
490  MachineInstr *UseMI,
491  int UseOpIdx,
492  SmallVectorImpl<FoldCandidate> &FoldList,
493  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
494  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
495 
496  if (!isUseSafeToFold(TII, *UseMI, UseOp))
497  return;
498 
499  // FIXME: Fold operands with subregs.
500  if (UseOp.isReg() && OpToFold.isReg()) {
501  if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
502  return;
503 
504  // Don't fold subregister extracts into tied operands, only if it is a full
505  // copy since a subregister use tied to a full register def doesn't really
506  // make sense. e.g. don't fold:
507  //
508  // %1 = COPY %0:sub1
509  // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
510  //
511  // into
512  // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
513  if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
514  return;
515  }
516 
517  // Special case for REG_SEQUENCE: We can't fold literals into
518  // REG_SEQUENCE instructions, so we have to fold them into the
519  // uses of REG_SEQUENCE.
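  // That is, the fold candidate is propagated to every user of the
  // REG_SEQUENCE result that reads the subregister this operand defines.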
520  if (UseMI->isRegSequence()) {
521  unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
522  unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
523 
524  MachineRegisterInfo::use_iterator Next;
525  for (MachineRegisterInfo::use_iterator
526  RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
527  RSUse != RSE; RSUse = Next) {
528  Next = std::next(RSUse);
529 
530  MachineInstr *RSUseMI = RSUse->getParent();
531 
532  if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
533  RSUse.getOperandNo(), FoldList))
534  continue;
535 
536  if (RSUse->getSubReg() != RegSeqDstSubReg)
537  continue;
538 
539  foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
540  CopiesToReplace);
541  }
542 
543  return;
544  }
545 
546  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
547  return;
548 
549  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
550  // Sanity check that this is a stack access.
551  // FIXME: Should probably use stack pseudos before frame lowering.
552  MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
553  if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
554  SOff->getReg() != MFI->getStackPtrOffsetReg()))
555  return;
556 
557  if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
558  MFI->getScratchRSrcReg())
559  return;
560 
561  // A frame index will resolve to a positive constant, so it should always be
562  // safe to fold the addressing mode, even pre-GFX9.
563  UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
564  SOff->setReg(MFI->getStackPtrOffsetReg());
565  return;
566  }
567 
568  bool FoldingImmLike =
569  OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
570 
571  if (FoldingImmLike && UseMI->isCopy()) {
572  unsigned DestReg = UseMI->getOperand(0).getReg();
573  const TargetRegisterClass *DestRC
574  = TargetRegisterInfo::isVirtualRegister(DestReg) ?
575  MRI->getRegClass(DestReg) :
576  TRI->getPhysRegClass(DestReg);
577 
578  unsigned SrcReg = UseMI->getOperand(1).getReg();
579  if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
580  TargetRegisterInfo::isVirtualRegister(SrcReg)) {
581  const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
582  if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
583  MachineRegisterInfo::use_iterator NextUse;
584  SmallVector<FoldCandidate, 4> CopyUses;
585  for (MachineRegisterInfo::use_iterator
586  Use = MRI->use_begin(DestReg), E = MRI->use_end();
587  Use != E; Use = NextUse) {
588  NextUse = std::next(Use);
589  FoldCandidate FC = FoldCandidate(Use->getParent(),
590  Use.getOperandNo(), &UseMI->getOperand(1));
591  CopyUses.push_back(FC);
592  }
593  for (auto & F : CopyUses) {
594  foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
595  FoldList, CopiesToReplace);
596  }
597  }
598  }
599 
600  if (DestRC == &AMDGPU::AGPR_32RegClass &&
601  TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
602  UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
603  UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
604  CopiesToReplace.push_back(UseMI);
605  return;
606  }
607 
608  // In order to fold immediates into copies, we need to change the
609  // copy to a MOV.
610 
611  unsigned MovOp = TII->getMovOpcode(DestRC);
612  if (MovOp == AMDGPU::COPY)
613  return;
614 
615  UseMI->setDesc(TII->get(MovOp));
616  CopiesToReplace.push_back(UseMI);
617  } else {
618  if (UseMI->isCopy() && OpToFold.isReg() &&
619  TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
620  TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
621  TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
622  !UseMI->getOperand(1).getSubReg()) {
623  unsigned Size = TII->getOpSize(*UseMI, 1);
624  UseMI->getOperand(1).setReg(OpToFold.getReg());
625  UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
626  UseMI->getOperand(1).setIsKill(false);
627  CopiesToReplace.push_back(UseMI);
628  OpToFold.setIsKill(false);
629  if (Size != 4)
630  return;
631  if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
632  TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
633  UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
634  else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
635  TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
636  UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
637  return;
638  }
639 
640  unsigned UseOpc = UseMI->getOpcode();
641  if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
642  (UseOpc == AMDGPU::V_READLANE_B32 &&
643  (int)UseOpIdx ==
644  AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
645  // %vgpr = V_MOV_B32 imm
646  // %sgpr = V_READFIRSTLANE_B32 %vgpr
647  // =>
648  // %sgpr = S_MOV_B32 imm
649  if (FoldingImmLike) {
650  if (execMayBeModifiedBeforeUse(*MRI,
651  UseMI->getOperand(UseOpIdx).getReg(),
652  *OpToFold.getParent(),
653  *UseMI))
654  return;
655 
656  UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
657 
658  // FIXME: ChangeToImmediate should clear subreg
659  UseMI->getOperand(1).setSubReg(0);
660  if (OpToFold.isImm())
661  UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
662  else
663  UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
664  UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
665  return;
666  }
667 
668  if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
669  if (execMayBeModifiedBeforeUse(*MRI,
670  UseMI->getOperand(UseOpIdx).getReg(),
671  *OpToFold.getParent(),
672  *UseMI))
673  return;
674 
675  // %vgpr = COPY %sgpr0
676  // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
677  // =>
678  // %sgpr1 = COPY %sgpr0
679  UseMI->setDesc(TII->get(AMDGPU::COPY));
680  UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
681  return;
682  }
683  }
684 
685  const MCInstrDesc &UseDesc = UseMI->getDesc();
686 
687  // Don't fold into target independent nodes. Target independent opcodes
688  // don't have defined register classes.
689  if (UseDesc.isVariadic() ||
690  UseOp.isImplicit() ||
691  UseDesc.OpInfo[UseOpIdx].RegClass == -1)
692  return;
693  }
694 
695  if (!FoldingImmLike) {
696  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
697 
698  // FIXME: We could try to change the instruction from 64-bit to 32-bit
699  // to enable more folding opportunities. The shrink operands pass
700  // already does this.
701  return;
702  }
703 
704 
705  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
706  const TargetRegisterClass *FoldRC =
707  TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
708 
709  // Split 64-bit constants into 32-bits for folding.
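  // e.g. a use of sub0 is offered only the low 32 bits of the constant and a
  // use of sub1 only the high 32 bits.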
710  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
711  unsigned UseReg = UseOp.getReg();
712  const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
713 
714  if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
715  return;
716 
717  APInt Imm(64, OpToFold.getImm());
718  if (UseOp.getSubReg() == AMDGPU::sub0) {
719  Imm = Imm.getLoBits(32);
720  } else {
721  assert(UseOp.getSubReg() == AMDGPU::sub1);
722  Imm = Imm.getHiBits(32);
723  }
724 
725  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
726  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
727  return;
728  }
729 
730 
731 
732  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
733 }
734 
735 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
736  uint32_t LHS, uint32_t RHS) {
737  switch (Opcode) {
738  case AMDGPU::V_AND_B32_e64:
739  case AMDGPU::V_AND_B32_e32:
740  case AMDGPU::S_AND_B32:
741  Result = LHS & RHS;
742  return true;
743  case AMDGPU::V_OR_B32_e64:
744  case AMDGPU::V_OR_B32_e32:
745  case AMDGPU::S_OR_B32:
746  Result = LHS | RHS;
747  return true;
748  case AMDGPU::V_XOR_B32_e64:
749  case AMDGPU::V_XOR_B32_e32:
750  case AMDGPU::S_XOR_B32:
751  Result = LHS ^ RHS;
752  return true;
753  case AMDGPU::V_LSHL_B32_e64:
754  case AMDGPU::V_LSHL_B32_e32:
755  case AMDGPU::S_LSHL_B32:
756  // The instruction ignores the high bits for out of bounds shifts.
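  // e.g. a shift amount of 36 behaves like 36 & 31 = 4.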
757  Result = LHS << (RHS & 31);
758  return true;
759  case AMDGPU::V_LSHLREV_B32_e64:
760  case AMDGPU::V_LSHLREV_B32_e32:
761  Result = RHS << (LHS & 31);
762  return true;
763  case AMDGPU::V_LSHR_B32_e64:
764  case AMDGPU::V_LSHR_B32_e32:
765  case AMDGPU::S_LSHR_B32:
766  Result = LHS >> (RHS & 31);
767  return true;
768  case AMDGPU::V_LSHRREV_B32_e64:
769  case AMDGPU::V_LSHRREV_B32_e32:
770  Result = RHS >> (LHS & 31);
771  return true;
772  case AMDGPU::V_ASHR_I32_e64:
773  case AMDGPU::V_ASHR_I32_e32:
774  case AMDGPU::S_ASHR_I32:
775  Result = static_cast<int32_t>(LHS) >> (RHS & 31);
776  return true;
777  case AMDGPU::V_ASHRREV_I32_e64:
778  case AMDGPU::V_ASHRREV_I32_e32:
779  Result = static_cast<int32_t>(RHS) >> (LHS & 31);
780  return true;
781  default:
782  return false;
783  }
784 }
785 
786 static unsigned getMovOpc(bool IsScalar) {
787  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
788 }
789 
790 /// Remove any leftover implicit operands from mutating the instruction. e.g.
791 /// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
792 /// anymore.
793 static void stripExtraCopyOperands(MachineInstr &MI) {
794  const MCInstrDesc &Desc = MI.getDesc();
795  unsigned NumOps = Desc.getNumOperands() +
796  Desc.getNumImplicitUses() +
797  Desc.getNumImplicitDefs();
798 
799  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
800  MI.RemoveOperand(I);
801 }
802 
803 static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
804  MI.setDesc(NewDesc);
805  stripExtraCopyOperands(MI);
806 }
807 
808 static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
809  MachineOperand &Op) {
810  if (Op.isReg()) {
811  // If this has a subregister, it obviously is a register source.
812  if (Op.getSubReg() != AMDGPU::NoSubRegister ||
813  !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
814  return &Op;
815 
816  MachineInstr *Def = MRI.getVRegDef(Op.getReg());
817  if (Def && Def->isMoveImmediate()) {
818  MachineOperand &ImmSrc = Def->getOperand(1);
819  if (ImmSrc.isImm())
820  return &ImmSrc;
821  }
822  }
823 
824  return &Op;
825 }
826 
827 // Try to simplify operations with a constant that may appear after instruction
828 // selection.
829 // TODO: See if a frame index with a fixed offset can fold.
830 static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
831  const SIInstrInfo *TII,
832  MachineInstr *MI,
833  MachineOperand *ImmOp) {
834  unsigned Opc = MI->getOpcode();
835  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
836  Opc == AMDGPU::S_NOT_B32) {
837  MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
838  mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
839  return true;
840  }
841 
842  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
843  if (Src1Idx == -1)
844  return false;
845 
846  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
847  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
848  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
849 
850  if (!Src0->isImm() && !Src1->isImm())
851  return false;
852 
853  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
854  if (Src0->isImm() && Src0->getImm() == 0) {
855  // v_lshl_or_b32 0, X, Y -> copy Y
856  // v_lshl_or_b32 0, X, K -> v_mov_b32 K
857  bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
858  MI->RemoveOperand(Src1Idx);
859  MI->RemoveOperand(Src0Idx);
860 
861  MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
862  return true;
863  }
864  }
865 
866  // and k0, k1 -> v_mov_b32 (k0 & k1)
867  // or k0, k1 -> v_mov_b32 (k0 | k1)
868  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
869  if (Src0->isImm() && Src1->isImm()) {
870  int32_t NewImm;
871  if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
872  return false;
873 
874  const SIRegisterInfo &TRI = TII->getRegisterInfo();
875  bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
876 
877  // Be careful to change the right operand, src0 may belong to a different
878  // instruction.
879  MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
880  MI->RemoveOperand(Src1Idx);
881  mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
882  return true;
883  }
884 
885  if (!MI->isCommutable())
886  return false;
887 
888  if (Src0->isImm() && !Src1->isImm()) {
889  std::swap(Src0, Src1);
890  std::swap(Src0Idx, Src1Idx);
891  }
892 
893  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
894  if (Opc == AMDGPU::V_OR_B32_e64 ||
895  Opc == AMDGPU::V_OR_B32_e32 ||
896  Opc == AMDGPU::S_OR_B32) {
897  if (Src1Val == 0) {
898  // y = or x, 0 => y = copy x
899  MI->RemoveOperand(Src1Idx);
900  mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
901  } else if (Src1Val == -1) {
902  // y = or x, -1 => y = v_mov_b32 -1
903  MI->RemoveOperand(Src1Idx);
904  mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
905  } else
906  return false;
907 
908  return true;
909  }
910 
911  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
912  MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
913  MI->getOpcode() == AMDGPU::S_AND_B32) {
914  if (Src1Val == 0) {
915  // y = and x, 0 => y = v_mov_b32 0
916  MI->RemoveOperand(Src0Idx);
917  mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
918  } else if (Src1Val == -1) {
919  // y = and x, -1 => y = copy x
920  MI->RemoveOperand(Src1Idx);
921  mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
922  stripExtraCopyOperands(*MI);
923  } else
924  return false;
925 
926  return true;
927  }
928 
929  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
930  MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
931  MI->getOpcode() == AMDGPU::S_XOR_B32) {
932  if (Src1Val == 0) {
933  // y = xor x, 0 => y = copy x
934  MI->RemoveOperand(Src1Idx);
935  mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
936  return true;
937  }
938  }
939 
940  return false;
941 }
942 
943 // Try to fold an instruction into a simpler one
944 static bool tryFoldInst(const SIInstrInfo *TII,
945  MachineInstr *MI) {
946  unsigned Opc = MI->getOpcode();
947 
948  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
949  Opc == AMDGPU::V_CNDMASK_B32_e64 ||
950  Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
951  const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
952  const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
953  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
954  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
955  if (Src1->isIdenticalTo(*Src0) &&
956  (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
957  (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
958  LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
959  auto &NewDesc =
960  TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
961  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
962  if (Src2Idx != -1)
963  MI->RemoveOperand(Src2Idx);
964  MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
965  if (Src1ModIdx != -1)
966  MI->RemoveOperand(Src1ModIdx);
967  if (Src0ModIdx != -1)
968  MI->RemoveOperand(Src0ModIdx);
969  mutateCopyOp(*MI, NewDesc);
970  LLVM_DEBUG(dbgs() << *MI << '\n');
971  return true;
972  }
973  }
974 
975  return false;
976 }
977 
978 void SIFoldOperands::foldInstOperand(MachineInstr &MI,
979  MachineOperand &OpToFold) const {
980  // We need to mutate the operands of new mov instructions to add implicit
981  // uses of EXEC, but adding them invalidates the use_iterator, so defer
982  // this.
983  SmallVector<MachineInstr *, 4> CopiesToReplace;
984  SmallVector<FoldCandidate, 4> FoldList;
985  MachineOperand &Dst = MI.getOperand(0);
986 
987  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
988  if (FoldingImm) {
989  unsigned NumLiteralUses = 0;
990  MachineOperand *NonInlineUse = nullptr;
991  int NonInlineUseOpNo = -1;
992 
993  MachineRegisterInfo::use_iterator NextUse;
994  for (MachineRegisterInfo::use_iterator
995  Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
996  Use != E; Use = NextUse) {
997  NextUse = std::next(Use);
998  MachineInstr *UseMI = Use->getParent();
999  unsigned OpNo = Use.getOperandNo();
1000 
1001  // Folding the immediate may reveal operations that can be constant
1002  // folded or replaced with a copy. This can happen for example after
1003  // frame indices are lowered to constants or from splitting 64-bit
1004  // constants.
1005  //
1006  // We may also encounter cases where one or both operands are
1007  // immediates materialized into a register, which would ordinarily not
1008  // be folded due to multiple uses or operand constraints.
1009 
1010  if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
1011  LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
1012 
1013  // Some constant folding cases change the same immediate's use to a new
1014  // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
1015  // again. The same constant folded instruction could also have a second
1016  // use operand.
1017  NextUse = MRI->use_begin(Dst.getReg());
1018  FoldList.clear();
1019  continue;
1020  }
1021 
1022  // Try to fold any inline immediate uses, and then only fold other
1023  // constants if they have one use.
1024  //
1025  // The legality of the inline immediate must be checked based on the use
1026  // operand, not the defining instruction, because 32-bit instructions
1027  // with 32-bit inline immediate sources may be used to materialize
1028  // constants used in 16-bit operands.
1029  //
1030  // e.g. it is unsafe to fold:
1031  // s_mov_b32 s0, 1.0 // materializes 0x3f800000
1032  // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
1033 
1034  // Folding immediates with more than one use will increase program size.
1035  // FIXME: This will also reduce register usage, which may be better
1036  // in some cases. A better heuristic is needed.
1037  if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
1038  foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
1039  } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
1040  foldOperand(OpToFold, UseMI, OpNo, FoldList,
1041  CopiesToReplace);
1042  } else {
1043  if (++NumLiteralUses == 1) {
1044  NonInlineUse = &*Use;
1045  NonInlineUseOpNo = OpNo;
1046  }
1047  }
1048  }
1049 
1050  if (NumLiteralUses == 1) {
1051  MachineInstr *UseMI = NonInlineUse->getParent();
1052  foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
1053  }
1054  } else {
1055  // Folding register.
1056  SmallVector<MachineRegisterInfo::use_iterator, 4> UsesToProcess;
1057  for (MachineRegisterInfo::use_iterator
1058  Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
1059  Use != E; ++Use) {
1060  UsesToProcess.push_back(Use);
1061  }
1062  for (auto U : UsesToProcess) {
1063  MachineInstr *UseMI = U->getParent();
1064 
1065  foldOperand(OpToFold, UseMI, U.getOperandNo(),
1066  FoldList, CopiesToReplace);
1067  }
1068  }
1069 
1070  MachineFunction *MF = MI.getParent()->getParent();
1071  // Make sure we add EXEC uses to any new v_mov instructions created.
1072  for (MachineInstr *Copy : CopiesToReplace)
1073  Copy->addImplicitDefUseOperands(*MF);
1074 
1075  for (FoldCandidate &Fold : FoldList) {
1076  if (updateOperand(Fold, *TII, *TRI, *ST)) {
1077  // Clear kill flags.
1078  if (Fold.isReg()) {
1079  assert(Fold.OpToFold && Fold.OpToFold->isReg());
1080  // FIXME: Probably shouldn't bother trying to fold if not an
1081  // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1082  // copies.
1083  MRI->clearKillFlags(Fold.OpToFold->getReg());
1084  }
1085  LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1086  << static_cast<int>(Fold.UseOpNo) << " of "
1087  << *Fold.UseMI << '\n');
1088  tryFoldInst(TII, Fold.UseMI);
1089  } else if (Fold.isCommuted()) {
1090  // Restoring instruction's original operand order if fold has failed.
1091  TII->commuteInstruction(*Fold.UseMI, false);
1092  }
1093  }
1094 }
1095 
1096 // Clamp patterns are canonically selected to v_max_* instructions, so only
1097 // handle them.
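// In other words, a clamped value x is matched here as v_max_*(x, x) with the
// clamp bit set, identical sources, and no output modifier.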
1098 const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1099  unsigned Op = MI.getOpcode();
1100  switch (Op) {
1101  case AMDGPU::V_MAX_F32_e64:
1102  case AMDGPU::V_MAX_F16_e64:
1103  case AMDGPU::V_MAX_F64:
1104  case AMDGPU::V_PK_MAX_F16: {
1105  if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1106  return nullptr;
1107 
1108  // Make sure sources are identical.
1109  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1110  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1111  if (!Src0->isReg() || !Src1->isReg() ||
1112  Src0->getReg() != Src1->getReg() ||
1113  Src0->getSubReg() != Src1->getSubReg() ||
1114  Src0->getSubReg() != AMDGPU::NoSubRegister)
1115  return nullptr;
1116 
1117  // Can't fold up if we have modifiers.
1118  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1119  return nullptr;
1120 
1121  unsigned Src0Mods
1122  = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1123  unsigned Src1Mods
1124  = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1125 
1126  // Having a 0 op_sel_hi would require swizzling the output in the source
1127  // instruction, which we can't do.
1128  unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1129  : 0u;
1130  if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1131  return nullptr;
1132  return Src0;
1133  }
1134  default:
1135  return nullptr;
1136  }
1137 }
1138 
1139 // We obviously have multiple uses in a clamp since the register is used twice
1140 // in the same instruction.
1141 static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
1142  int Count = 0;
1143  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
1144  I != E; ++I) {
1145  if (++Count > 1)
1146  return false;
1147  }
1148 
1149  return true;
1150 }
1151 
1152 // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1153 bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1154  const MachineOperand *ClampSrc = isClamp(MI);
1155  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
1156  return false;
1157 
1158  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1159 
1160  // The type of clamp must be compatible.
1161  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1162  return false;
1163 
1164  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1165  if (!DefClamp)
1166  return false;
1167 
1168  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
1169  << '\n');
1170 
1171  // Clamp is applied after omod, so it is OK if omod is set.
1172  DefClamp->setImm(1);
1173  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1174  MI.eraseFromParent();
1175  return true;
1176 }
1177 
1178 static int getOModValue(unsigned Opc, int64_t Val) {
1179  switch (Opc) {
1180  case AMDGPU::V_MUL_F32_e64: {
1181  switch (static_cast<uint32_t>(Val)) {
1182  case 0x3f000000: // 0.5
1183  return SIOutMods::DIV2;
1184  case 0x40000000: // 2.0
1185  return SIOutMods::MUL2;
1186  case 0x40800000: // 4.0
1187  return SIOutMods::MUL4;
1188  default:
1189  return SIOutMods::NONE;
1190  }
1191  }
1192  case AMDGPU::V_MUL_F16_e64: {
1193  switch (static_cast<uint16_t>(Val)) {
1194  case 0x3800: // 0.5
1195  return SIOutMods::DIV2;
1196  case 0x4000: // 2.0
1197  return SIOutMods::MUL2;
1198  case 0x4400: // 4.0
1199  return SIOutMods::MUL4;
1200  default:
1201  return SIOutMods::NONE;
1202  }
1203  }
1204  default:
1205  llvm_unreachable("invalid mul opcode");
1206  }
1207 }
1208 
1209 // FIXME: Does this really not support denormals with f16?
1210 // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1211 // handled, so will anything other than that break?
1212 std::pair<const MachineOperand *, int>
1213 SIFoldOperands::isOMod(const MachineInstr &MI) const {
1214  unsigned Op = MI.getOpcode();
1215  switch (Op) {
1216  case AMDGPU::V_MUL_F32_e64:
1217  case AMDGPU::V_MUL_F16_e64: {
1218  // If output denormals are enabled, omod is ignored.
1219  if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
1220  (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
1221  return std::make_pair(nullptr, SIOutMods::NONE);
1222 
1223  const MachineOperand *RegOp = nullptr;
1224  const MachineOperand *ImmOp = nullptr;
1225  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1226  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1227  if (Src0->isImm()) {
1228  ImmOp = Src0;
1229  RegOp = Src1;
1230  } else if (Src1->isImm()) {
1231  ImmOp = Src1;
1232  RegOp = Src0;
1233  } else
1234  return std::make_pair(nullptr, SIOutMods::NONE);
1235 
1236  int OMod = getOModValue(Op, ImmOp->getImm());
1237  if (OMod == SIOutMods::NONE ||
1238  TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1239  TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1240  TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1241  TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1242  return std::make_pair(nullptr, SIOutMods::NONE);
1243 
1244  return std::make_pair(RegOp, OMod);
1245  }
1246  case AMDGPU::V_ADD_F32_e64:
1247  case AMDGPU::V_ADD_F16_e64: {
1248  // If output denormals are enabled, omod is ignored.
1249  if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
1250  (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
1251  return std::make_pair(nullptr, SIOutMods::NONE);
1252 
1253  // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
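    // i.e. v_add_f32 x, x with no modifiers set is treated as x * 2.0 (omod MUL2).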
1254  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1255  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1256 
1257  if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1258  Src0->getSubReg() == Src1->getSubReg() &&
1259  !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1260  !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1261  !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1262  !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1263  return std::make_pair(Src0, SIOutMods::MUL2);
1264 
1265  return std::make_pair(nullptr, SIOutMods::NONE);
1266  }
1267  default:
1268  return std::make_pair(nullptr, SIOutMods::NONE);
1269  }
1270 }
1271 
1272 // FIXME: Does this need to check IEEE bit on function?
1273 bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1274  const MachineOperand *RegOp;
1275  int OMod;
1276  std::tie(RegOp, OMod) = isOMod(MI);
1277  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1278  RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1279  !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
1280  return false;
1281 
1282  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1283  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1284  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1285  return false;
1286 
1287  // Clamp is applied after omod. If the source already has clamp set, don't
1288  // fold it.
1289  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1290  return false;
1291 
1292  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
1293 
1294  DefOMod->setImm(OMod);
1295  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1296  MI.eraseFromParent();
1297  return true;
1298 }
1299 
1300 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
1301  if (skipFunction(MF.getFunction()))
1302  return false;
1303 
1304  MRI = &MF.getRegInfo();
1305  ST = &MF.getSubtarget<GCNSubtarget>();
1306  TII = ST->getInstrInfo();
1307  TRI = &TII->getRegisterInfo();
1308  MFI = MF.getInfo<SIMachineFunctionInfo>();
1309 
1310  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
1311  // correctly handle signed zeros.
1312  //
1313  // FIXME: Also need to check strictfp
1314  bool IsIEEEMode = MFI->getMode().IEEE;
1315  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
1316 
1317  for (MachineBasicBlock *MBB : depth_first(&MF)) {
1318  MachineBasicBlock::iterator I, Next;
1319  for (I = MBB->begin(); I != MBB->end(); I = Next) {
1320  Next = std::next(I);
1321  MachineInstr &MI = *I;
1322 
1323  tryFoldInst(TII, &MI);
1324 
1325  if (!TII->isFoldableCopy(MI)) {
1326  // TODO: Omod might be OK if there is NSZ only on the source
1327  // instruction, and not the omod multiply.
1328  if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
1329  !tryFoldOMod(MI))
1330  tryFoldClamp(MI);
1331  continue;
1332  }
1333 
1334  MachineOperand &OpToFold = MI.getOperand(1);
1335  bool FoldingImm =
1336  OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1337 
1338  // FIXME: We could also be folding things like TargetIndexes.
1339  if (!FoldingImm && !OpToFold.isReg())
1340  continue;
1341 
1342  if (OpToFold.isReg() &&
1343  !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
1344  continue;
1345 
1346  // Prevent folding operands backwards in the function. For example,
1347  // the COPY opcode must not be replaced by 1 in this example:
1348  //
1349  // %3 = COPY %vgpr0; VGPR_32:%3
1350  // ...
1351  // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1352  MachineOperand &Dst = MI.getOperand(0);
1353  if (Dst.isReg() &&
1354  !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
1355  continue;
1356 
1357  foldInstOperand(MI, OpToFold);
1358  }
1359  }
1360  return false;
1361 }