1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// SI Implementation of TargetInstrInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "SIInstrInfo.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUSubtarget.h"
18 #include "GCNHazardRecognizer.h"
19 #include "SIDefines.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/ADT/APInt.h"
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/StringRef.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/InlineAsm.h"
50 #include "llvm/IR/LLVMContext.h"
51 #include "llvm/MC/MCInstrDesc.h"
52 #include "llvm/Support/Casting.h"
54 #include "llvm/Support/Compiler.h"
59 #include <cassert>
60 #include <cstdint>
61 #include <iterator>
62 #include <utility>
63 
64 using namespace llvm;
65 
66 // Must be at least 4 to be able to branch over minimum unconditional branch
67 // code. This is only for making it possible to write reasonably small tests for
68 // long branches.
69 static cl::opt<unsigned>
70 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
71  cl::desc("Restrict range of branch instructions (DEBUG)"));
72 
73 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
74  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
75 
76 //===----------------------------------------------------------------------===//
77 // TargetInstrInfo callbacks
78 //===----------------------------------------------------------------------===//
79 
80 static unsigned getNumOperandsNoGlue(SDNode *Node) {
81  unsigned N = Node->getNumOperands();
82  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
83  --N;
84  return N;
85 }
86 
87 static SDValue findChainOperand(SDNode *Load) {
88  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
89  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
90  return LastOp;
91 }
92 
93 /// Returns true if both nodes have the same value for the given
94 /// operand \p Op, or if both nodes do not have this operand.
95 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
96  unsigned Opc0 = N0->getMachineOpcode();
97  unsigned Opc1 = N1->getMachineOpcode();
98 
99  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
100  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
101 
102  if (Op0Idx == -1 && Op1Idx == -1)
103  return true;
104 
105 
106  if ((Op0Idx == -1 && Op1Idx != -1) ||
107  (Op1Idx == -1 && Op0Idx != -1))
108  return false;
109 
110  // getNamedOperandIdx returns the index for the MachineInstr's operands,
111  // which includes the result as the first operand. We are indexing into the
112  // MachineSDNode's operands, so we need to skip the result operand to get
113  // the real index.
114  --Op0Idx;
115  --Op1Idx;
116 
117  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
118 }
119 
120 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
121  AliasAnalysis *AA) const {
122  // TODO: The generic check fails for VALU instructions that should be
123  // rematerializable due to implicit reads of exec. We really want all of the
124  // generic logic for this, except for the implicit exec read check.
125  switch (MI.getOpcode()) {
126  case AMDGPU::V_MOV_B32_e32:
127  case AMDGPU::V_MOV_B32_e64:
128  case AMDGPU::V_MOV_B64_PSEUDO:
129  return true;
130  default:
131  return false;
132  }
133 }
134 
135 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
136  int64_t &Offset0,
137  int64_t &Offset1) const {
138  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
139  return false;
140 
141  unsigned Opc0 = Load0->getMachineOpcode();
142  unsigned Opc1 = Load1->getMachineOpcode();
143 
144  // Make sure both are actually loads.
145  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
146  return false;
147 
148  if (isDS(Opc0) && isDS(Opc1)) {
149 
150  // FIXME: Handle this case:
151  if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
152  return false;
153 
154  // Check base reg.
155  if (Load0->getOperand(1) != Load1->getOperand(1))
156  return false;
157 
158  // Check chain.
159  if (findChainOperand(Load0) != findChainOperand(Load1))
160  return false;
161 
162  // Skip read2 / write2 variants for simplicity.
163  // TODO: We should report true if the used offsets are adjacent (excluding
164  // the st64 versions).
165  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
166  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
167  return false;
168 
169  Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
170  Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
171  return true;
172  }
173 
174  if (isSMRD(Opc0) && isSMRD(Opc1)) {
175  // Skip time and cache invalidation instructions.
176  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
177  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
178  return false;
179 
180  assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
181 
182  // Check base reg.
183  if (Load0->getOperand(0) != Load1->getOperand(0))
184  return false;
185 
186  const ConstantSDNode *Load0Offset =
187  dyn_cast<ConstantSDNode>(Load0->getOperand(1));
188  const ConstantSDNode *Load1Offset =
189  dyn_cast<ConstantSDNode>(Load1->getOperand(1));
190 
191  if (!Load0Offset || !Load1Offset)
192  return false;
193 
194  // Check chain.
195  if (findChainOperand(Load0) != findChainOperand(Load1))
196  return false;
197 
198  Offset0 = Load0Offset->getZExtValue();
199  Offset1 = Load1Offset->getZExtValue();
200  return true;
201  }
202 
203  // MUBUF and MTBUF can access the same addresses.
204  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
205 
206  // MUBUF and MTBUF have vaddr at different indices.
207  if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
208  findChainOperand(Load0) != findChainOperand(Load1) ||
209  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
210  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
211  return false;
212 
213  int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
214  int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
215 
216  if (OffIdx0 == -1 || OffIdx1 == -1)
217  return false;
218 
219  // getNamedOperandIdx returns the index for MachineInstrs. Since they
220  // include the output in the operand list, but SDNodes don't, we need to
221  // subtract one from the index.
222  --OffIdx0;
223  --OffIdx1;
224 
225  SDValue Off0 = Load0->getOperand(OffIdx0);
226  SDValue Off1 = Load1->getOperand(OffIdx1);
227 
228  // The offset might be a FrameIndexSDNode.
229  if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
230  return false;
231 
232  Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
233  Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
234  return true;
235  }
236 
237  return false;
238 }
239 
240 static bool isStride64(unsigned Opc) {
241  switch (Opc) {
242  case AMDGPU::DS_READ2ST64_B32:
243  case AMDGPU::DS_READ2ST64_B64:
244  case AMDGPU::DS_WRITE2ST64_B32:
245  case AMDGPU::DS_WRITE2ST64_B64:
246  return true;
247  default:
248  return false;
249  }
250 }
251 
252 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
253  int64_t &Offset,
254  const TargetRegisterInfo *TRI) const {
255  unsigned Opc = LdSt.getOpcode();
256 
257  if (isDS(LdSt)) {
258  const MachineOperand *OffsetImm =
259  getNamedOperand(LdSt, AMDGPU::OpName::offset);
260  if (OffsetImm) {
261  // Normal, single offset LDS instruction.
262  const MachineOperand *AddrReg =
263  getNamedOperand(LdSt, AMDGPU::OpName::addr);
264 
265  BaseReg = AddrReg->getReg();
266  Offset = OffsetImm->getImm();
267  return true;
268  }
269 
270  // The 2 offset instructions use offset0 and offset1 instead. We can treat
271  // these as a load with a single offset if the 2 offsets are consecutive. We
272  // will use this for some partially aligned loads.
273  const MachineOperand *Offset0Imm =
274  getNamedOperand(LdSt, AMDGPU::OpName::offset0);
275  const MachineOperand *Offset1Imm =
276  getNamedOperand(LdSt, AMDGPU::OpName::offset1);
277 
278  uint8_t Offset0 = Offset0Imm->getImm();
279  uint8_t Offset1 = Offset1Imm->getImm();
280 
281  if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
282  // Each of these offsets is in element sized units, so we need to convert
283  // to bytes of the individual reads.
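 // For a load the destination register covers both elements, so its size in
 // bits is divided by 16 (two elements times eight bits per byte); e.g. a
 // ds_read2_b32 writes a 64-bit pair, giving 4-byte elements. Stores take
 // the size of the single data0 operand directly.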
284 
285  unsigned EltSize;
286  if (LdSt.mayLoad())
287  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
288  else {
289  assert(LdSt.mayStore());
290  int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
291  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
292  }
293 
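 // The st64 variants place consecutive elements 64 elements apart, so the
 // element size is scaled accordingly below.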
294  if (isStride64(Opc))
295  EltSize *= 64;
296 
297  const MachineOperand *AddrReg =
298  getNamedOperand(LdSt, AMDGPU::OpName::addr);
299  BaseReg = AddrReg->getReg();
300  Offset = EltSize * Offset0;
301  return true;
302  }
303 
304  return false;
305  }
306 
307  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
308  const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
309  if (SOffset && SOffset->isReg())
310  return false;
311 
312  const MachineOperand *AddrReg =
313  getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
314  if (!AddrReg)
315  return false;
316 
317  const MachineOperand *OffsetImm =
318  getNamedOperand(LdSt, AMDGPU::OpName::offset);
319  BaseReg = AddrReg->getReg();
320  Offset = OffsetImm->getImm();
321 
322  if (SOffset) // soffset can be an inline immediate.
323  Offset += SOffset->getImm();
324 
325  return true;
326  }
327 
328  if (isSMRD(LdSt)) {
329  const MachineOperand *OffsetImm =
330  getNamedOperand(LdSt, AMDGPU::OpName::offset);
331  if (!OffsetImm)
332  return false;
333 
334  const MachineOperand *SBaseReg =
335  getNamedOperand(LdSt, AMDGPU::OpName::sbase);
336  BaseReg = SBaseReg->getReg();
337  Offset = OffsetImm->getImm();
338  return true;
339  }
340 
341  if (isFLAT(LdSt)) {
342  const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
343  if (VAddr) {
344  // Can't analyze 2 offsets.
345  if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
346  return false;
347 
348  BaseReg = VAddr->getReg();
349  } else {
350  // scratch instructions have either vaddr or saddr.
351  BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
352  }
353 
354  Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
355  return true;
356  }
357 
358  return false;
359 }
360 
361 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
362  const MachineInstr &MI2, unsigned BaseReg2) {
363  if (BaseReg1 == BaseReg2)
364  return true;
365 
366  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
367  return false;
368 
369  auto MO1 = *MI1.memoperands_begin();
370  auto MO2 = *MI2.memoperands_begin();
371  if (MO1->getAddrSpace() != MO2->getAddrSpace())
372  return false;
373 
374  auto Base1 = MO1->getValue();
375  auto Base2 = MO2->getValue();
376  if (!Base1 || !Base2)
377  return false;
378  const MachineFunction &MF = *MI1.getParent()->getParent();
379  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
380  Base1 = GetUnderlyingObject(Base1, DL);
381  Base2 = GetUnderlyingObject(Base2, DL);
382 
383  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
384  return false;
385 
386  return Base1 == Base2;
387 }
388 
389 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
390  unsigned BaseReg1,
391  MachineInstr &SecondLdSt,
392  unsigned BaseReg2,
393  unsigned NumLoads) const {
394  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
395  return false;
396 
397  const MachineOperand *FirstDst = nullptr;
398  const MachineOperand *SecondDst = nullptr;
399 
400  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
401  (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
402  (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
403  const unsigned MaxGlobalLoadCluster = 6;
404  if (NumLoads > MaxGlobalLoadCluster)
405  return false;
406 
407  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
408  if (!FirstDst)
409  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
410  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
411  if (!SecondDst)
412  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
413  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
414  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
415  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
416  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
417  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
418  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
419  }
420 
421  if (!FirstDst || !SecondDst)
422  return false;
423 
424  // Try to limit clustering based on the total number of bytes loaded
425  // rather than the number of instructions. This is done to help reduce
426  // register pressure. The method used is somewhat inexact, though,
427  // because it assumes that all loads in the cluster will load the
428  // same number of bytes as FirstLdSt.
429 
430  // The unit of this value is bytes.
431  // FIXME: This needs finer tuning.
432  unsigned LoadClusterThreshold = 16;
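 // For example, with 32-bit destination registers this allows clustering up
 // to four loads (4 * 4 = 16 bytes), while a pair of 128-bit loads
 // (2 * 16 = 32 bytes) is not clustered.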
433 
434  const MachineRegisterInfo &MRI =
435  FirstLdSt.getParent()->getParent()->getRegInfo();
436  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
437 
438  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
439 }
440 
441 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
442  MachineBasicBlock::iterator MI,
443  const DebugLoc &DL, unsigned DestReg,
444  unsigned SrcReg, bool KillSrc) {
445  MachineFunction *MF = MBB.getParent();
446  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
447  "illegal SGPR to VGPR copy",
448  DL, DS_Error);
449  LLVMContext &C = MF->getFunction().getContext();
450  C.diagnose(IllegalCopy);
451 
452  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
453  .addReg(SrcReg, getKillRegState(KillSrc));
454 }
455 
456 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
457  MachineBasicBlock::iterator MI,
458  const DebugLoc &DL, unsigned DestReg,
459  unsigned SrcReg, bool KillSrc) const {
460  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
461 
462  if (RC == &AMDGPU::VGPR_32RegClass) {
463  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
464  AMDGPU::SReg_32RegClass.contains(SrcReg));
465  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
466  .addReg(SrcReg, getKillRegState(KillSrc));
467  return;
468  }
469 
470  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
471  RC == &AMDGPU::SReg_32RegClass) {
472  if (SrcReg == AMDGPU::SCC) {
473  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
474  .addImm(-1)
475  .addImm(0);
476  return;
477  }
478 
479  if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
480  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
481  return;
482  }
483 
484  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
485  .addReg(SrcReg, getKillRegState(KillSrc));
486  return;
487  }
488 
489  if (RC == &AMDGPU::SReg_64RegClass) {
490  if (DestReg == AMDGPU::VCC) {
491  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
492  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
493  .addReg(SrcReg, getKillRegState(KillSrc));
494  } else {
495  // FIXME: Hack until VReg_1 removed.
496  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
497  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
498  .addImm(0)
499  .addReg(SrcReg, getKillRegState(KillSrc));
500  }
501 
502  return;
503  }
504 
505  if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
506  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
507  return;
508  }
509 
510  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
511  .addReg(SrcReg, getKillRegState(KillSrc));
512  return;
513  }
514 
515  if (DestReg == AMDGPU::SCC) {
516  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
517  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
518  .addReg(SrcReg, getKillRegState(KillSrc))
519  .addImm(0);
520  return;
521  }
522 
523  unsigned EltSize = 4;
524  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
525  if (RI.isSGPRClass(RC)) {
526  if (RI.getRegSizeInBits(*RC) > 32) {
527  Opcode = AMDGPU::S_MOV_B64;
528  EltSize = 8;
529  } else {
530  Opcode = AMDGPU::S_MOV_B32;
531  EltSize = 4;
532  }
533 
534  if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
535  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
536  return;
537  }
538  }
539 
540  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
541  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
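 // Copy sub-register by sub-register, low-to-high when the destination starts
 // at or below the source and high-to-low otherwise, so an overlapping source
 // is not clobbered before it is read. A 128-bit SGPR copy, for example, is
 // emitted as two S_MOV_B64 sub-copies.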
542 
543  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
544  unsigned SubIdx;
545  if (Forward)
546  SubIdx = SubIndices[Idx];
547  else
548  SubIdx = SubIndices[SubIndices.size() - Idx - 1];
549 
550  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
551  get(Opcode), RI.getSubReg(DestReg, SubIdx));
552 
553  Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
554 
555  if (Idx == 0)
556  Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
557 
558  bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
559  Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
560  }
561 }
562 
563 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
564  int NewOpc;
565 
566  // Try to map original to commuted opcode
567  NewOpc = AMDGPU::getCommuteRev(Opcode);
568  if (NewOpc != -1)
569  // Check if the commuted (REV) opcode exists on the target.
570  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
571 
572  // Try to map commuted to original opcode
573  NewOpc = AMDGPU::getCommuteOrig(Opcode);
574  if (NewOpc != -1)
575  // Check if the original (non-REV) opcode exists on the target.
576  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
577 
578  return Opcode;
579 }
580 
581 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
582  MachineBasicBlock::iterator MI,
583  const DebugLoc &DL, unsigned DestReg,
584  int64_t Value) const {
585  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
586  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
587  if (RegClass == &AMDGPU::SReg_32RegClass ||
588  RegClass == &AMDGPU::SGPR_32RegClass ||
589  RegClass == &AMDGPU::SReg_32_XM0RegClass ||
590  RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
591  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
592  .addImm(Value);
593  return;
594  }
595 
596  if (RegClass == &AMDGPU::SReg_64RegClass ||
597  RegClass == &AMDGPU::SGPR_64RegClass ||
598  RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
599  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
600  .addImm(Value);
601  return;
602  }
603 
604  if (RegClass == &AMDGPU::VGPR_32RegClass) {
605  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
606  .addImm(Value);
607  return;
608  }
609  if (RegClass == &AMDGPU::VReg_64RegClass) {
610  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
611  .addImm(Value);
612  return;
613  }
614 
615  unsigned EltSize = 4;
616  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
617  if (RI.isSGPRClass(RegClass)) {
618  if (RI.getRegSizeInBits(*RegClass) > 32) {
619  Opcode = AMDGPU::S_MOV_B64;
620  EltSize = 8;
621  } else {
622  Opcode = AMDGPU::S_MOV_B32;
623  EltSize = 4;
624  }
625  }
626 
627  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
628  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
629  int64_t IdxValue = Idx == 0 ? Value : 0;
630 
631  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
632  get(Opcode), RI.getSubReg(DestReg, Idx));
633  Builder.addImm(IdxValue);
634  }
635 }
636 
637 const TargetRegisterClass *
638 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
639  return &AMDGPU::VGPR_32RegClass;
640 }
641 
642 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
643  MachineBasicBlock::iterator I,
644  const DebugLoc &DL, unsigned DstReg,
645  ArrayRef<MachineOperand> Cond,
646  unsigned TrueReg,
647  unsigned FalseReg) const {
648  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
649  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
650  "Not a VGPR32 reg");
651 
652  if (Cond.size() == 1) {
653  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
654  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
655  .add(Cond[0]);
656  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
657  .addReg(FalseReg)
658  .addReg(TrueReg)
659  .addReg(SReg);
660  } else if (Cond.size() == 2) {
661  assert(Cond[0].isImm() && "Cond[0] is not an immediate");
662  switch (Cond[0].getImm()) {
663  case SIInstrInfo::SCC_TRUE: {
664  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
665  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
666  .addImm(-1)
667  .addImm(0);
668  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
669  .addReg(FalseReg)
670  .addReg(TrueReg)
671  .addReg(SReg);
672  break;
673  }
674  case SIInstrInfo::SCC_FALSE: {
675  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
676  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
677  .addImm(0)
678  .addImm(-1);
679  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
680  .addReg(FalseReg)
681  .addReg(TrueReg)
682  .addReg(SReg);
683  break;
684  }
685  case SIInstrInfo::VCCNZ: {
686  MachineOperand RegOp = Cond[1];
687  RegOp.setImplicit(false);
688  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
689  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
690  .add(RegOp);
691  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
692  .addReg(FalseReg)
693  .addReg(TrueReg)
694  .addReg(SReg);
695  break;
696  }
697  case SIInstrInfo::VCCZ: {
698  MachineOperand RegOp = Cond[1];
699  RegOp.setImplicit(false);
700  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
701  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
702  .add(RegOp);
703  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
704  .addReg(TrueReg)
705  .addReg(FalseReg)
706  .addReg(SReg);
707  break;
708  }
709  case SIInstrInfo::EXECNZ: {
710  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
711  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
712  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
713  .addImm(0);
714  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
715  .addImm(-1)
716  .addImm(0);
717  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
718  .addReg(FalseReg)
719  .addReg(TrueReg)
720  .addReg(SReg);
721  break;
722  }
723  case SIInstrInfo::EXECZ: {
724  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
725  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
726  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
727  .addImm(0);
728  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
729  .addImm(0)
730  .addImm(-1);
731  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
732  .addReg(FalseReg)
733  .addReg(TrueReg)
734  .addReg(SReg);
735  llvm_unreachable("Unhandled branch predicate EXECZ");
736  break;
737  }
738  default:
739  llvm_unreachable("invalid branch predicate");
740  }
741  } else {
742  llvm_unreachable("Can only handle Cond size 1 or 2");
743  }
744 }
745 
746 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
747  MachineBasicBlock::iterator I,
748  const DebugLoc &DL,
749  unsigned SrcReg, int Value) const {
750  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
751  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
752  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
753  .addImm(Value)
754  .addReg(SrcReg);
755 
756  return Reg;
757 }
758 
759 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
760  MachineBasicBlock::iterator I,
761  const DebugLoc &DL,
762  unsigned SrcReg, int Value) const {
763  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
764  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
765  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
766  .addImm(Value)
767  .addReg(SrcReg);
768 
769  return Reg;
770 }
771 
772 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
773 
774  if (RI.getRegSizeInBits(*DstRC) == 32) {
775  return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
776  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
777  return AMDGPU::S_MOV_B64;
778  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
779  return AMDGPU::V_MOV_B64_PSEUDO;
780  }
781  return AMDGPU::COPY;
782 }
783 
784 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
785  switch (Size) {
786  case 4:
787  return AMDGPU::SI_SPILL_S32_SAVE;
788  case 8:
789  return AMDGPU::SI_SPILL_S64_SAVE;
790  case 16:
791  return AMDGPU::SI_SPILL_S128_SAVE;
792  case 32:
793  return AMDGPU::SI_SPILL_S256_SAVE;
794  case 64:
795  return AMDGPU::SI_SPILL_S512_SAVE;
796  default:
797  llvm_unreachable("unknown register size");
798  }
799 }
800 
801 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
802  switch (Size) {
803  case 4:
804  return AMDGPU::SI_SPILL_V32_SAVE;
805  case 8:
806  return AMDGPU::SI_SPILL_V64_SAVE;
807  case 12:
808  return AMDGPU::SI_SPILL_V96_SAVE;
809  case 16:
810  return AMDGPU::SI_SPILL_V128_SAVE;
811  case 32:
812  return AMDGPU::SI_SPILL_V256_SAVE;
813  case 64:
814  return AMDGPU::SI_SPILL_V512_SAVE;
815  default:
816  llvm_unreachable("unknown register size");
817  }
818 }
819 
820 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
821  MachineBasicBlock::iterator MI,
822  unsigned SrcReg, bool isKill,
823  int FrameIndex,
824  const TargetRegisterClass *RC,
825  const TargetRegisterInfo *TRI) const {
826  MachineFunction *MF = MBB.getParent();
827  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
828  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
829  DebugLoc DL = MBB.findDebugLoc(MI);
830 
831  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
832  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
833  MachinePointerInfo PtrInfo
834  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
835  MachineMemOperand *MMO
836  = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
837  Size, Align);
838  unsigned SpillSize = TRI->getSpillSize(*RC);
839 
840  if (RI.isSGPRClass(RC)) {
841  MFI->setHasSpilledSGPRs();
842 
843  // We are only allowed to create one new instruction when spilling
844  // registers, so we need to use pseudo instruction for spilling SGPRs.
845  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
846 
847  // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
848  // to make sure we are using the correct register class.
849  if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
850  MachineRegisterInfo &MRI = MF->getRegInfo();
851  MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
852  }
853 
854  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
855  .addReg(SrcReg, getKillRegState(isKill)) // data
856  .addFrameIndex(FrameIndex) // addr
857  .addMemOperand(MMO)
859  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
860  // Add the scratch resource registers as implicit uses because we may end up
861  // needing them, and need to ensure that the reserved registers are
862  // correctly handled.
863 
864  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
865  if (ST.hasScalarStores()) {
866  // m0 is used for offset to scalar stores if used to spill.
867  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
868  }
869 
870  return;
871  }
872 
873  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
874  LLVMContext &Ctx = MF->getFunction().getContext();
875  Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
876  " spill register");
877  BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
878  .addReg(SrcReg);
879 
880  return;
881  }
882 
883  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
884 
885  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
886  MFI->setHasSpilledVGPRs();
887  BuildMI(MBB, MI, DL, get(Opcode))
888  .addReg(SrcReg, getKillRegState(isKill)) // data
889  .addFrameIndex(FrameIndex) // addr
890  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
891  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
892  .addImm(0) // offset
893  .addMemOperand(MMO);
894 }
895 
896 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
897  switch (Size) {
898  case 4:
899  return AMDGPU::SI_SPILL_S32_RESTORE;
900  case 8:
901  return AMDGPU::SI_SPILL_S64_RESTORE;
902  case 16:
903  return AMDGPU::SI_SPILL_S128_RESTORE;
904  case 32:
905  return AMDGPU::SI_SPILL_S256_RESTORE;
906  case 64:
907  return AMDGPU::SI_SPILL_S512_RESTORE;
908  default:
909  llvm_unreachable("unknown register size");
910  }
911 }
912 
913 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
914  switch (Size) {
915  case 4:
916  return AMDGPU::SI_SPILL_V32_RESTORE;
917  case 8:
918  return AMDGPU::SI_SPILL_V64_RESTORE;
919  case 12:
920  return AMDGPU::SI_SPILL_V96_RESTORE;
921  case 16:
922  return AMDGPU::SI_SPILL_V128_RESTORE;
923  case 32:
924  return AMDGPU::SI_SPILL_V256_RESTORE;
925  case 64:
926  return AMDGPU::SI_SPILL_V512_RESTORE;
927  default:
928  llvm_unreachable("unknown register size");
929  }
930 }
931 
932 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
933  MachineBasicBlock::iterator MI,
934  unsigned DestReg, int FrameIndex,
935  const TargetRegisterClass *RC,
936  const TargetRegisterInfo *TRI) const {
937  MachineFunction *MF = MBB.getParent();
938  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
939  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
940  DebugLoc DL = MBB.findDebugLoc(MI);
941  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
942  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
943  unsigned SpillSize = TRI->getSpillSize(*RC);
944 
945  MachinePointerInfo PtrInfo
946  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
947 
948  MachineMemOperand *MMO = MF->getMachineMemOperand(
949  PtrInfo, MachineMemOperand::MOLoad, Size, Align);
950 
951  if (RI.isSGPRClass(RC)) {
952  // FIXME: Maybe this should not include a memoperand because it will be
953  // lowered to non-memory instructions.
954  const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
955  if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
956  MachineRegisterInfo &MRI = MF->getRegInfo();
957  MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
958  }
959 
960  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
961  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
962  .addFrameIndex(FrameIndex) // addr
963  .addMemOperand(MMO)
965  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
966 
967  if (ST.hasScalarStores()) {
968  // m0 is used for offset to scalar stores if used to spill.
969  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
970  }
971 
972  return;
973  }
974 
975  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
976  LLVMContext &Ctx = MF->getFunction().getContext();
977  Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
978  " restore register");
979  BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
980 
981  return;
982  }
983 
984  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
985 
986  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
987  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
988  .addFrameIndex(FrameIndex) // vaddr
989  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
990  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
991  .addImm(0) // offset
992  .addMemOperand(MMO);
993 }
994 
995 /// \param @Offset Offset in bytes of the FrameIndex being spilled
996 unsigned SIInstrInfo::calculateLDSSpillAddress(
997  MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
998  unsigned FrameOffset, unsigned Size) const {
999  MachineFunction *MF = MBB.getParent();
1000  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1001  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
1002  DebugLoc DL = MBB.findDebugLoc(MI);
1003  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1004  unsigned WavefrontSize = ST.getWavefrontSize();
1005 
1006  unsigned TIDReg = MFI->getTIDReg();
1007  if (!MFI->hasCalculatedTID()) {
1008  MachineBasicBlock &Entry = MBB.getParent()->front();
1009  MachineBasicBlock::iterator Insert = Entry.front();
1010  DebugLoc DL = Insert->getDebugLoc();
1011 
1012  TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1013  *MF);
1014  if (TIDReg == AMDGPU::NoRegister)
1015  return TIDReg;
1016 
1018  WorkGroupSize > WavefrontSize) {
1019  unsigned TIDIGXReg
1021  unsigned TIDIGYReg
1023  unsigned TIDIGZReg
1025  unsigned InputPtrReg =
1027  for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1028  if (!Entry.isLiveIn(Reg))
1029  Entry.addLiveIn(Reg);
1030  }
1031 
1032  RS->enterBasicBlock(Entry);
1033  // FIXME: Can we scavenge an SReg_64 and access the subregs?
1034  unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1035  unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1036  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1037  .addReg(InputPtrReg)
1039  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1040  .addReg(InputPtrReg)
1042 
1043  // NGROUPS.X * NGROUPS.Y
1044  BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1045  .addReg(STmp1)
1046  .addReg(STmp0);
1047  // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1048  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1049  .addReg(STmp1)
1050  .addReg(TIDIGXReg);
1051  // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1052  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1053  .addReg(STmp0)
1054  .addReg(TIDIGYReg)
1055  .addReg(TIDReg);
1056  // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1057  getAddNoCarry(Entry, Insert, DL, TIDReg)
1058  .addReg(TIDReg)
1059  .addReg(TIDIGZReg);
1060  } else {
1061  // Get the wave id
1062  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1063  TIDReg)
1064  .addImm(-1)
1065  .addImm(0);
1066 
1067  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1068  TIDReg)
1069  .addImm(-1)
1070  .addReg(TIDReg);
1071  }
1072 
1073  BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1074  TIDReg)
1075  .addImm(2)
1076  .addReg(TIDReg);
1077  MFI->setTIDReg(TIDReg);
1078  }
1079 
1080  // Add FrameIndex to LDS offset
1081  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1082  getAddNoCarry(MBB, MI, DL, TmpReg)
1083  .addImm(LDSOffset)
1084  .addReg(TIDReg);
1085 
1086  return TmpReg;
1087 }
1088 
1089 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1090  MachineBasicBlock::iterator MI,
1091  int Count) const {
1092  DebugLoc DL = MBB.findDebugLoc(MI);
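 // Each S_NOP waits for (imm + 1) wait states, up to 8 per instruction, so
 // e.g. Count == 10 emits S_NOP 7 followed by S_NOP 1.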
1093  while (Count > 0) {
1094  int Arg;
1095  if (Count >= 8)
1096  Arg = 7;
1097  else
1098  Arg = Count - 1;
1099  Count -= 8;
1100  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1101  .addImm(Arg);
1102  }
1103 }
1104 
1105 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1106  MachineBasicBlock::iterator MI) const {
1107  insertWaitStates(MBB, MI, 1);
1108 }
1109 
1110 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1111  auto MF = MBB.getParent();
1112  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1113 
1114  assert(Info->isEntryFunction());
1115 
1116  if (MBB.succ_empty()) {
1117  bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1118  if (HasNoTerminator)
1119  BuildMI(MBB, MBB.end(), DebugLoc(),
1120  get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1121  }
1122 }
1123 
1124 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
1125  switch (MI.getOpcode()) {
1126  default: return 1; // FIXME: Do wait states equal cycles?
1127 
1128  case AMDGPU::S_NOP:
1129  return MI.getOperand(0).getImm() + 1;
1130  }
1131 }
1132 
1133 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1134  MachineBasicBlock &MBB = *MI.getParent();
1135  DebugLoc DL = MBB.findDebugLoc(MI);
1136  switch (MI.getOpcode()) {
1137  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
1138  case AMDGPU::S_MOV_B64_term:
1139  // This is only a terminator to get the correct spill code placement during
1140  // register allocation.
1141  MI.setDesc(get(AMDGPU::S_MOV_B64));
1142  break;
1143 
1144  case AMDGPU::S_XOR_B64_term:
1145  // This is only a terminator to get the correct spill code placement during
1146  // register allocation.
1147  MI.setDesc(get(AMDGPU::S_XOR_B64));
1148  break;
1149 
1150  case AMDGPU::S_ANDN2_B64_term:
1151  // This is only a terminator to get the correct spill code placement during
1152  // register allocation.
1153  MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1154  break;
1155 
1156  case AMDGPU::V_MOV_B64_PSEUDO: {
1157  unsigned Dst = MI.getOperand(0).getReg();
1158  unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1159  unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1160 
1161  const MachineOperand &SrcOp = MI.getOperand(1);
1162  // FIXME: Will this work for 64-bit floating point immediates?
1163  assert(!SrcOp.isFPImm());
1164  if (SrcOp.isImm()) {
1165  APInt Imm(64, SrcOp.getImm());
1166  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1167  .addImm(Imm.getLoBits(32).getZExtValue())
1168  .addReg(Dst, RegState::Implicit | RegState::Define);
1169  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1170  .addImm(Imm.getHiBits(32).getZExtValue())
1171  .addReg(Dst, RegState::Implicit | RegState::Define);
1172  } else {
1173  assert(SrcOp.isReg());
1174  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1175  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1176  .addReg(Dst, RegState::Implicit | RegState::Define);
1177  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1178  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1179  .addReg(Dst, RegState::Implicit | RegState::Define);
1180  }
1181  MI.eraseFromParent();
1182  break;
1183  }
1184  case AMDGPU::V_SET_INACTIVE_B32: {
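 // Flip exec so that only the currently inactive lanes execute the move,
 // write the requested value into them, then flip exec back.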
1185  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1186  .addReg(AMDGPU::EXEC);
1187  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1188  .add(MI.getOperand(2));
1189  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1190  .addReg(AMDGPU::EXEC);
1191  MI.eraseFromParent();
1192  break;
1193  }
1194  case AMDGPU::V_SET_INACTIVE_B64: {
1195  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1196  .addReg(AMDGPU::EXEC);
1197  MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1198  MI.getOperand(0).getReg())
1199  .add(MI.getOperand(2));
1200  expandPostRAPseudo(*Copy);
1201  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1202  .addReg(AMDGPU::EXEC);
1203  MI.eraseFromParent();
1204  break;
1205  }
1206  case AMDGPU::V_MOVRELD_B32_V1:
1207  case AMDGPU::V_MOVRELD_B32_V2:
1208  case AMDGPU::V_MOVRELD_B32_V4:
1209  case AMDGPU::V_MOVRELD_B32_V8:
1210  case AMDGPU::V_MOVRELD_B32_V16: {
1211  const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1212  unsigned VecReg = MI.getOperand(0).getReg();
1213  bool IsUndef = MI.getOperand(1).isUndef();
1214  unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1215  assert(VecReg == MI.getOperand(1).getReg());
1216 
1217  MachineInstr *MovRel =
1218  BuildMI(MBB, MI, DL, MovRelDesc)
1219  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1220  .add(MI.getOperand(2))
1221  .addReg(VecReg, RegState::ImplicitDefine)
1222  .addReg(VecReg,
1223  RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1224 
1225  const int ImpDefIdx =
1226  MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1227  const int ImpUseIdx = ImpDefIdx + 1;
1228  MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1229 
1230  MI.eraseFromParent();
1231  break;
1232  }
1233  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1234  MachineFunction &MF = *MBB.getParent();
1235  unsigned Reg = MI.getOperand(0).getReg();
1236  unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1237  unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1238 
1239  // Create a bundle so these instructions won't be re-ordered by the
1240  // post-RA scheduler.
1241  MIBundleBuilder Bundler(MBB, MI);
1242  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1243 
1244  // Add 32-bit offset from this instruction to the start of the
1245  // constant data.
1246  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1247  .addReg(RegLo)
1248  .add(MI.getOperand(1)));
1249 
1250  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1251  .addReg(RegHi);
1253  MIB.addImm(0);
1254  else
1255  MIB.add(MI.getOperand(2));
1256 
1257  Bundler.append(MIB);
1258  finalizeBundle(MBB, Bundler.begin());
1259 
1260  MI.eraseFromParent();
1261  break;
1262  }
1263  case AMDGPU::EXIT_WWM: {
1264  // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1265  // is exited.
1266  MI.setDesc(get(AMDGPU::S_MOV_B64));
1267  break;
1268  }
1269  case TargetOpcode::BUNDLE: {
1270  if (!MI.mayLoad())
1271  return false;
1272 
1273  // If it is a load it must be a memory clause
1274  for (MachineBasicBlock::instr_iterator I = MI.getIterator();
1275  I->isBundledWithSucc(); ++I) {
1276  I->unbundleFromSucc();
1277  for (MachineOperand &MO : I->operands())
1278  if (MO.isReg())
1279  MO.setIsInternalRead(false);
1280  }
1281 
1282  MI.eraseFromParent();
1283  break;
1284  }
1285  }
1286  return true;
1287 }
1288 
1289 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1290  MachineOperand &Src0,
1291  unsigned Src0OpName,
1292  MachineOperand &Src1,
1293  unsigned Src1OpName) const {
1294  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1295  if (!Src0Mods)
1296  return false;
1297 
1298  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1299  assert(Src1Mods &&
1300  "All commutable instructions have both src0 and src1 modifiers");
1301 
1302  int Src0ModsVal = Src0Mods->getImm();
1303  int Src1ModsVal = Src1Mods->getImm();
1304 
1305  Src1Mods->setImm(Src0ModsVal);
1306  Src0Mods->setImm(Src1ModsVal);
1307  return true;
1308 }
1309 
1310 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1311  MachineOperand &RegOp,
1312  MachineOperand &NonRegOp) {
1313  unsigned Reg = RegOp.getReg();
1314  unsigned SubReg = RegOp.getSubReg();
1315  bool IsKill = RegOp.isKill();
1316  bool IsDead = RegOp.isDead();
1317  bool IsUndef = RegOp.isUndef();
1318  bool IsDebug = RegOp.isDebug();
1319 
1320  if (NonRegOp.isImm())
1321  RegOp.ChangeToImmediate(NonRegOp.getImm());
1322  else if (NonRegOp.isFI())
1323  RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1324  else
1325  return nullptr;
1326 
1327  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1328  NonRegOp.setSubReg(SubReg);
1329 
1330  return &MI;
1331 }
1332 
1333 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1334  unsigned Src0Idx,
1335  unsigned Src1Idx) const {
1336  assert(!NewMI && "this should never be used");
1337 
1338  unsigned Opc = MI.getOpcode();
1339  int CommutedOpcode = commuteOpcode(Opc);
1340  if (CommutedOpcode == -1)
1341  return nullptr;
1342 
1343  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1344  static_cast<int>(Src0Idx) &&
1345  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1346  static_cast<int>(Src1Idx) &&
1347  "inconsistency with findCommutedOpIndices");
1348 
1349  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1350  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1351 
1352  MachineInstr *CommutedMI = nullptr;
1353  if (Src0.isReg() && Src1.isReg()) {
1354  if (isOperandLegal(MI, Src1Idx, &Src0)) {
1355  // Be sure to copy the source modifiers to the right place.
1356  CommutedMI
1357  = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1358  }
1359 
1360  } else if (Src0.isReg() && !Src1.isReg()) {
1361  // src0 should always be able to support any operand type, so no need to
1362  // check operand legality.
1363  CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1364  } else if (!Src0.isReg() && Src1.isReg()) {
1365  if (isOperandLegal(MI, Src1Idx, &Src0))
1366  CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1367  } else {
1368  // FIXME: Found two non registers to commute. This does happen.
1369  return nullptr;
1370  }
1371 
1372  if (CommutedMI) {
1373  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1374  Src1, AMDGPU::OpName::src1_modifiers);
1375 
1376  CommutedMI->setDesc(get(CommutedOpcode));
1377  }
1378 
1379  return CommutedMI;
1380 }
1381 
1382 // This needs to be implemented because the source modifiers may be inserted
1383 // between the true commutable operands, and the base
1384 // TargetInstrInfo::commuteInstruction uses it.
1385 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1386  unsigned &SrcOpIdx1) const {
1387  if (!MI.isCommutable())
1388  return false;
1389 
1390  unsigned Opc = MI.getOpcode();
1391  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1392  if (Src0Idx == -1)
1393  return false;
1394 
1395  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1396  if (Src1Idx == -1)
1397  return false;
1398 
1399  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1400 }
1401 
1402 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1403  int64_t BrOffset) const {
1404  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1405  // block is unanalyzable.
1406  assert(BranchOp != AMDGPU::S_SETPC_B64);
1407 
1408  // Convert to dwords.
1409  BrOffset /= 4;
1410 
1411  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1412  // from the next instruction.
1413  BrOffset -= 1;
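 // With the default of 16 offset bits this accepts a signed 16-bit dword
 // offset, i.e. roughly +/-128 KiB relative to the instruction after the
 // branch.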
1414 
1415  return isIntN(BranchOffsetBits, BrOffset);
1416 }
1417 
1418 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1419  const MachineInstr &MI) const {
1420  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1421  // This would be a difficult analysis to perform, but can always be legal so
1422  // there's no need to analyze it.
1423  return nullptr;
1424  }
1425 
1426  return MI.getOperand(0).getMBB();
1427 }
1428 
1429 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1430  MachineBasicBlock &DestBB,
1431  const DebugLoc &DL,
1432  int64_t BrOffset,
1433  RegScavenger *RS) const {
1434  assert(RS && "RegScavenger required for long branching");
1435  assert(MBB.empty() &&
1436  "new block should be inserted for expanding unconditional branch");
1437  assert(MBB.pred_size() == 1);
1438 
1439  MachineFunction *MF = MBB.getParent();
1440  MachineRegisterInfo &MRI = MF->getRegInfo();
1441 
1442  // FIXME: Virtual register workaround for RegScavenger not working with empty
1443  // blocks.
1444  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1445 
1446  auto I = MBB.end();
1447 
1448  // We need to compute the offset relative to the instruction immediately after
1449  // s_getpc_b64. Insert pc arithmetic code before last terminator.
1450  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1451 
1452  // TODO: Handle > 32-bit block address.
1453  if (BrOffset >= 0) {
1454  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1455  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1456  .addReg(PCReg, 0, AMDGPU::sub0)
1458  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1459  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1460  .addReg(PCReg, 0, AMDGPU::sub1)
1461  .addImm(0);
1462  } else {
1463  // Backwards branch.
1464  BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1465  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1466  .addReg(PCReg, 0, AMDGPU::sub0)
1468  BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1469  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1470  .addReg(PCReg, 0, AMDGPU::sub1)
1471  .addImm(0);
1472  }
1473 
1474  // Insert the indirect branch after the other terminator.
1475  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1476  .addReg(PCReg);
1477 
1478  // FIXME: If spilling is necessary, this will fail because this scavenger has
1479  // no emergency stack slots. It is non-trivial to spill in this situation,
1480  // because the restore code needs to be specially placed after the
1481  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1482  // block.
1483  //
1484  // If a spill is needed for the pc register pair, we need to insert a spill
1485  // restore block right before the destination block, and insert a short branch
1486  // into the old destination block's fallthrough predecessor.
1487  // e.g.:
1488  //
1489  // s_cbranch_scc0 skip_long_branch:
1490  //
1491  // long_branch_bb:
1492  // spill s[8:9]
1493  // s_getpc_b64 s[8:9]
1494  // s_add_u32 s8, s8, restore_bb
1495  // s_addc_u32 s9, s9, 0
1496  // s_setpc_b64 s[8:9]
1497  //
1498  // skip_long_branch:
1499  // foo;
1500  //
1501  // .....
1502  //
1503  // dest_bb_fallthrough_predecessor:
1504  // bar;
1505  // s_branch dest_bb
1506  //
1507  // restore_bb:
1508  // restore s[8:9]
1509  // fallthrough dest_bb
1510  //
1511  // dest_bb:
1512  // buzz;
1513 
1514  RS->enterBasicBlockEnd(MBB);
1515  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1516  MachineBasicBlock::iterator(GetPC), 0);
1517  MRI.replaceRegWith(PCReg, Scav);
1518  MRI.clearVirtRegs();
1519  RS->setRegUsed(Scav);
1520 
1521  return 4 + 8 + 4 + 4;
1522 }
1523 
1524 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1525  switch (Cond) {
1526  case SIInstrInfo::SCC_TRUE:
1527  return AMDGPU::S_CBRANCH_SCC1;
1528  case SIInstrInfo::SCC_FALSE:
1529  return AMDGPU::S_CBRANCH_SCC0;
1530  case SIInstrInfo::VCCNZ:
1531  return AMDGPU::S_CBRANCH_VCCNZ;
1532  case SIInstrInfo::VCCZ:
1533  return AMDGPU::S_CBRANCH_VCCZ;
1534  case SIInstrInfo::EXECNZ:
1535  return AMDGPU::S_CBRANCH_EXECNZ;
1536  case SIInstrInfo::EXECZ:
1537  return AMDGPU::S_CBRANCH_EXECZ;
1538  default:
1539  llvm_unreachable("invalid branch predicate");
1540  }
1541 }
1542 
1543 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1544  switch (Opcode) {
1545  case AMDGPU::S_CBRANCH_SCC0:
1546  return SCC_FALSE;
1547  case AMDGPU::S_CBRANCH_SCC1:
1548  return SCC_TRUE;
1549  case AMDGPU::S_CBRANCH_VCCNZ:
1550  return VCCNZ;
1551  case AMDGPU::S_CBRANCH_VCCZ:
1552  return VCCZ;
1553  case AMDGPU::S_CBRANCH_EXECNZ:
1554  return EXECNZ;
1555  case AMDGPU::S_CBRANCH_EXECZ:
1556  return EXECZ;
1557  default:
1558  return INVALID_BR;
1559  }
1560 }
1561 
1562 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1563  MachineBasicBlock::iterator I,
1564  MachineBasicBlock *&TBB,
1565  MachineBasicBlock *&FBB,
1566  SmallVectorImpl<MachineOperand> &Cond,
1567  bool AllowModify) const {
1568  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1569  // Unconditional Branch
1570  TBB = I->getOperand(0).getMBB();
1571  return false;
1572  }
1573 
1574  MachineBasicBlock *CondBB = nullptr;
1575 
1576  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1577  CondBB = I->getOperand(1).getMBB();
1578  Cond.push_back(I->getOperand(0));
1579  } else {
1580  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1581  if (Pred == INVALID_BR)
1582  return true;
1583 
1584  CondBB = I->getOperand(0).getMBB();
1585  Cond.push_back(MachineOperand::CreateImm(Pred));
1586  Cond.push_back(I->getOperand(1)); // Save the branch register.
1587  }
1588  ++I;
1589 
1590  if (I == MBB.end()) {
1591  // Conditional branch followed by fall-through.
1592  TBB = CondBB;
1593  return false;
1594  }
1595 
1596  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1597  TBB = CondBB;
1598  FBB = I->getOperand(0).getMBB();
1599  return false;
1600  }
1601 
1602  return true;
1603 }
1604 
1605 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1606  MachineBasicBlock *&FBB,
1607  SmallVectorImpl<MachineOperand> &Cond,
1608  bool AllowModify) const {
1609  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1610  if (I == MBB.end())
1611  return false;
1612 
1613  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1614  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1615 
1616  ++I;
1617 
1618  // TODO: Should be able to treat as fallthrough?
1619  if (I == MBB.end())
1620  return true;
1621 
1622  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1623  return true;
1624 
1625  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1626 
1627  // Specifically handle the case where the conditional branch is to the same
1628  // destination as the mask branch. e.g.
1629  //
1630  // si_mask_branch BB8
1631  // s_cbranch_execz BB8
1632  // s_cbranch BB9
1633  //
1634  // This is required to understand divergent loops which may need the branches
1635  // to be relaxed.
1636  if (TBB != MaskBrDest || Cond.empty())
1637  return true;
1638 
1639  auto Pred = Cond[0].getImm();
1640  return (Pred != EXECZ && Pred != EXECNZ);
1641 }
1642 
1643 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1644  int *BytesRemoved) const {
1645  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1646 
1647  unsigned Count = 0;
1648  unsigned RemovedSize = 0;
1649  while (I != MBB.end()) {
1650  MachineBasicBlock::iterator Next = std::next(I);
1651  if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1652  I = Next;
1653  continue;
1654  }
1655 
1656  RemovedSize += getInstSizeInBytes(*I);
1657  I->eraseFromParent();
1658  ++Count;
1659  I = Next;
1660  }
1661 
1662  if (BytesRemoved)
1663  *BytesRemoved = RemovedSize;
1664 
1665  return Count;
1666 }
1667 
1668 // Copy the flags onto the implicit condition register operand.
1669 static void preserveCondRegFlags(MachineOperand &CondReg,
1670  const MachineOperand &OrigCond) {
1671  CondReg.setIsUndef(OrigCond.isUndef());
1672  CondReg.setIsKill(OrigCond.isKill());
1673 }
1674 
1675 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1676  MachineBasicBlock *TBB,
1677  MachineBasicBlock *FBB,
1678  ArrayRef<MachineOperand> Cond,
1679  const DebugLoc &DL,
1680  int *BytesAdded) const {
1681  if (!FBB && Cond.empty()) {
1682  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1683  .addMBB(TBB);
1684  if (BytesAdded)
1685  *BytesAdded = 4;
1686  return 1;
1687  }
1688 
1689  if (Cond.size() == 1 && Cond[0].isReg()) {
1690  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1691  .add(Cond[0])
1692  .addMBB(TBB);
1693  return 1;
1694  }
1695 
1696  assert(TBB && Cond[0].isImm());
1697 
1698  unsigned Opcode
1699  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1700 
1701  if (!FBB) {
1702  Cond[1].isUndef();
1703  MachineInstr *CondBr =
1704  BuildMI(&MBB, DL, get(Opcode))
1705  .addMBB(TBB);
1706 
1707  // Copy the flags onto the implicit condition register operand.
1708  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1709 
1710  if (BytesAdded)
1711  *BytesAdded = 4;
1712  return 1;
1713  }
1714 
1715  assert(TBB && FBB);
1716 
1717  MachineInstr *CondBr =
1718  BuildMI(&MBB, DL, get(Opcode))
1719  .addMBB(TBB);
1720  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1721  .addMBB(FBB);
1722 
1723  MachineOperand &CondReg = CondBr->getOperand(1);
1724  CondReg.setIsUndef(Cond[1].isUndef());
1725  CondReg.setIsKill(Cond[1].isKill());
1726 
1727  if (BytesAdded)
1728  *BytesAdded = 8;
1729 
1730  return 2;
1731 }
1732 
1733 bool SIInstrInfo::reverseBranchCondition(
1734  SmallVectorImpl<MachineOperand> &Cond) const {
1735  if (Cond.size() != 2) {
1736  return true;
1737  }
1738 
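 // The BranchPredicate values encode each predicate and its inverse as
 // negations of each other (e.g. SCC_TRUE/SCC_FALSE, VCCNZ/VCCZ), so negating
 // the immediate reverses the condition; insertSelect relies on the same
 // property.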
1739  if (Cond[0].isImm()) {
1740  Cond[0].setImm(-Cond[0].getImm());
1741  return false;
1742  }
1743 
1744  return true;
1745 }
1746 
1747 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1748  ArrayRef<MachineOperand> Cond,
1749  unsigned TrueReg, unsigned FalseReg,
1750  int &CondCycles,
1751  int &TrueCycles, int &FalseCycles) const {
1752  switch (Cond[0].getImm()) {
1753  case VCCNZ:
1754  case VCCZ: {
1755  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1756  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1757  assert(MRI.getRegClass(FalseReg) == RC);
1758 
1759  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1760  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1761 
1762  // Limit to equal cost for branch vs. N v_cndmask_b32s.
1763  return !RI.isSGPRClass(RC) && NumInsts <= 6;
1764  }
1765  case SCC_TRUE:
1766  case SCC_FALSE: {
1767  // FIXME: We could insert for VGPRs if we could replace the original compare
1768  // with a vector one.
1769  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1770  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1771  assert(MRI.getRegClass(FalseReg) == RC);
1772 
1773  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1774 
1775  // Multiples of 8 can do s_cselect_b64
1776  if (NumInsts % 2 == 0)
1777  NumInsts /= 2;
1778 
1779  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1780  return RI.isSGPRClass(RC);
1781  }
1782  default:
1783  return false;
1784  }
1785 }
1786 
1787 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1788  MachineBasicBlock::iterator I, const DebugLoc &DL,
1789  unsigned DstReg, ArrayRef<MachineOperand> Cond,
1790  unsigned TrueReg, unsigned FalseReg) const {
1791  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1792  if (Pred == VCCZ || Pred == SCC_FALSE) {
1793  Pred = static_cast<BranchPredicate>(-Pred);
1794  std::swap(TrueReg, FalseReg);
1795  }
1796 
1798  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1799  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1800 
1801  if (DstSize == 32) {
1802  unsigned SelOp = Pred == SCC_TRUE ?
1803  AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1804 
1805  // Instruction's operands are backwards from what is expected.
1806  MachineInstr *Select =
1807  BuildMI(MBB, I, DL, get(SelOp), DstReg)
1808  .addReg(FalseReg)
1809  .addReg(TrueReg);
1810 
1811  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1812  return;
1813  }
1814 
1815  if (DstSize == 64 && Pred == SCC_TRUE) {
1816  MachineInstr *Select =
1817  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1818  .addReg(FalseReg)
1819  .addReg(TrueReg);
1820 
1821  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1822  return;
1823  }
1824 
1825  static const int16_t Sub0_15[] = {
1826  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1827  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1828  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1829  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1830  };
1831 
1832  static const int16_t Sub0_15_64[] = {
1833  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1834  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1835  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1836  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1837  };
1838 
1839  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1840  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1841  const int16_t *SubIndices = Sub0_15;
1842  int NElts = DstSize / 32;
1843 
1844  // 64-bit select is only available on the SALU.
1845  if (Pred == SCC_TRUE) {
1846  SelOp = AMDGPU::S_CSELECT_B64;
1847  EltRC = &AMDGPU::SGPR_64RegClass;
1848  SubIndices = Sub0_15_64;
1849 
1850  assert(NElts % 2 == 0);
1851  NElts /= 2;
1852  }
1853 
1854  MachineInstrBuilder MIB = BuildMI(
1855  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1856 
1857  I = MIB->getIterator();
1858 
1860  for (int Idx = 0; Idx != NElts; ++Idx) {
1861  unsigned DstElt = MRI.createVirtualRegister(EltRC);
1862  Regs.push_back(DstElt);
1863 
1864  unsigned SubIdx = SubIndices[Idx];
1865 
1866  MachineInstr *Select =
1867  BuildMI(MBB, I, DL, get(SelOp), DstElt)
1868  .addReg(FalseReg, 0, SubIdx)
1869  .addReg(TrueReg, 0, SubIdx);
1870  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1871 
1872  MIB.addReg(DstElt)
1873  .addImm(SubIdx);
1874  }
1875 }
1876 
1878  switch (MI.getOpcode()) {
1879  case AMDGPU::V_MOV_B32_e32:
1880  case AMDGPU::V_MOV_B32_e64:
1881  case AMDGPU::V_MOV_B64_PSEUDO: {
1882  // If there are additional implicit register operands, this may be used for
1883  // register indexing so the source register operand isn't simply copied.
1884  unsigned NumOps = MI.getDesc().getNumOperands() +
1885  MI.getDesc().getNumImplicitUses();
1886 
1887  return MI.getNumOperands() == NumOps;
1888  }
1889  case AMDGPU::S_MOV_B32:
1890  case AMDGPU::S_MOV_B64:
1891  case AMDGPU::COPY:
1892  return true;
1893  default:
1894  return false;
1895  }
1896 }
1897 
1900  switch(Kind) {
1903  return AMDGPUASI.PRIVATE_ADDRESS;
1910  return AMDGPUASI.CONSTANT_ADDRESS;
1911  }
1912  return AMDGPUASI.FLAT_ADDRESS;
1913 }
1914 
1916  unsigned Opc = MI.getOpcode();
1917  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1918  AMDGPU::OpName::src0_modifiers);
1919  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1920  AMDGPU::OpName::src1_modifiers);
1921  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1922  AMDGPU::OpName::src2_modifiers);
1923 
1924  MI.RemoveOperand(Src2ModIdx);
1925  MI.RemoveOperand(Src1ModIdx);
1926  MI.RemoveOperand(Src0ModIdx);
1927 }
1928 
1930  unsigned Reg, MachineRegisterInfo *MRI) const {
1931  if (!MRI->hasOneNonDBGUse(Reg))
1932  return false;
1933 
1934  switch (DefMI.getOpcode()) {
1935  default:
1936  return false;
1937  case AMDGPU::S_MOV_B64:
1938  // TODO: We could fold 64-bit immediates, but this gets complicated
1939  // when there are sub-registers.
1940  return false;
1941 
1942  case AMDGPU::V_MOV_B32_e32:
1943  case AMDGPU::S_MOV_B32:
1944  break;
1945  }
1946 
1947  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1948  assert(ImmOp);
1949  // FIXME: We could handle FrameIndex values here.
1950  if (!ImmOp->isImm())
1951  return false;
1952 
1953  unsigned Opc = UseMI.getOpcode();
1954  if (Opc == AMDGPU::COPY) {
1955  bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1956  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1957  UseMI.setDesc(get(NewOpc));
1958  UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
1959  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
1960  return true;
1961  }
1962 
1963  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1964  Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1965  // Don't fold if we are using source or output modifiers. The new VOP2
1966  // instructions don't have them.
1967  if (hasAnyModifiersSet(UseMI))
1968  return false;
1969 
1970  // If this is a free constant, there's no reason to do this.
1971  // TODO: We could fold this here instead of letting SIFoldOperands do it
1972  // later.
1973  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
1974 
1975  // Any src operand can be used for the legality check.
1976  if (isInlineConstant(UseMI, *Src0, *ImmOp))
1977  return false;
1978 
1979  bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1980  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
1981  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
1982 
1983  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
1984  // We should only expect these to be on src0 due to canonicalizations.
1985  if (Src0->isReg() && Src0->getReg() == Reg) {
1986  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1987  return false;
1988 
1989  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1990  return false;
1991 
1992  // We need to swap operands 0 and 1 since the madmk constant is at operand 1.
1993 
1994  const int64_t Imm = ImmOp->getImm();
1995 
1996  // FIXME: This would be a lot easier if we could return a new instruction
1997  // instead of having to modify in place.
1998 
1999  // Remove these first since they are at the end.
2000  UseMI.RemoveOperand(
2001  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2002  UseMI.RemoveOperand(
2003  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2004 
2005  unsigned Src1Reg = Src1->getReg();
2006  unsigned Src1SubReg = Src1->getSubReg();
2007  Src0->setReg(Src1Reg);
2008  Src0->setSubReg(Src1SubReg);
2009  Src0->setIsKill(Src1->isKill());
2010 
2011  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2012  Opc == AMDGPU::V_MAC_F16_e64)
2013  UseMI.untieRegOperand(
2014  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2015 
2016  Src1->ChangeToImmediate(Imm);
2017 
2018  removeModOperands(UseMI);
2019  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2020 
2021  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2022  if (DeleteDef)
2023  DefMI.eraseFromParent();
2024 
2025  return true;
2026  }
2027 
2028  // Added part is the constant: Use v_madak_{f16, f32}.
2029  if (Src2->isReg() && Src2->getReg() == Reg) {
2030  // Not allowed to use constant bus for another operand.
2031  // We can however allow an inline immediate as src0.
2032  if (!Src0->isImm() &&
2033  (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2034  return false;
2035 
2036  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2037  return false;
2038 
2039  const int64_t Imm = ImmOp->getImm();
2040 
2041  // FIXME: This would be a lot easier if we could return a new instruction
2042  // instead of having to modify in place.
2043 
2044  // Remove these first since they are at the end.
2045  UseMI.RemoveOperand(
2046  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2047  UseMI.RemoveOperand(
2048  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2049 
2050  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2051  Opc == AMDGPU::V_MAC_F16_e64)
2052  UseMI.untieRegOperand(
2053  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2054 
2055  // ChangeToImmediate adds Src2 back to the instruction.
2056  Src2->ChangeToImmediate(Imm);
2057 
2058  // These come before src2.
2059  removeModOperands(UseMI);
2060  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2061 
2062  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2063  if (DeleteDef)
2064  DefMI.eraseFromParent();
2065 
2066  return true;
2067  }
2068  }
2069 
2070  return false;
2071 }
2072 
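// Two accesses are disjoint when the lower one ends at or before the higher
// one begins. For example, offsetsDoNotOverlap(4, 0, 4, 4) is true because
// [0,4) and [4,8) touch but do not overlap, while offsetsDoNotOverlap(8, 0, 4, 4)
// is false because [0,8) covers [4,8).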
2073 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2074  int WidthB, int OffsetB) {
2075  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2076  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2077  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2078  return LowOffset + LowWidth <= HighOffset;
2079 }
2080 
2081 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2082  MachineInstr &MIb) const {
2083  unsigned BaseReg0, BaseReg1;
2084  int64_t Offset0, Offset1;
2085 
2086  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
2087  getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
2088 
2089  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2090  // FIXME: Handle ds_read2 / ds_write2.
2091  return false;
2092  }
2093  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2094  unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2095  if (BaseReg0 == BaseReg1 &&
2096  offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2097  return true;
2098  }
2099  }
2100 
2101  return false;
2102 }
2103 
2105  MachineInstr &MIb,
2106  AliasAnalysis *AA) const {
2107  assert((MIa.mayLoad() || MIa.mayStore()) &&
2108  "MIa must load from or modify a memory location");
2109  assert((MIb.mayLoad() || MIb.mayStore()) &&
2110  "MIb must load from or modify a memory location");
2111 
2113  return false;
2114 
2115  // XXX - Can we relax this between address spaces?
2116  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2117  return false;
2118 
2119  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2120  const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2121  const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2122  if (MMOa->getValue() && MMOb->getValue()) {
2123  MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2124  MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2125  if (!AA->alias(LocA, LocB))
2126  return true;
2127  }
2128  }
2129 
2130  // TODO: Should we check the address space from the MachineMemOperand? That
2131  // would allow us to distinguish objects we know don't alias based on the
2132  // underlying address space, even if it was lowered to a different one,
2133  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2134  // buffer.
2135  if (isDS(MIa)) {
2136  if (isDS(MIb))
2137  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2138 
2139  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2140  }
2141 
2142  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2143  if (isMUBUF(MIb) || isMTBUF(MIb))
2144  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2145 
2146  return !isFLAT(MIb) && !isSMRD(MIb);
2147  }
2148 
2149  if (isSMRD(MIa)) {
2150  if (isSMRD(MIb))
2151  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2152 
2153  return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
2154  }
2155 
2156  if (isFLAT(MIa)) {
2157  if (isFLAT(MIb))
2158  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2159 
2160  return false;
2161  }
2162 
2163  return false;
2164 }
2165 
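// Returns the immediate if MO is a virtual register defined solely by a
// v_mov_b32 of an immediate, and 0 otherwise. Callers only test the result
// for non-zero, so an immediate that happens to be 0 is treated as not
// foldable.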
2166 static int64_t getFoldableImm(const MachineOperand* MO) {
2167  if (!MO->isReg())
2168  return 0;
2169  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2170  const MachineRegisterInfo &MRI = MF->getRegInfo();
2171  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2172  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2173  Def->getOperand(1).isImm())
2174  return Def->getOperand(1).getImm();
2175  return 0;
2176 }
2177 
2179  MachineInstr &MI,
2180  LiveVariables *LV) const {
2181  unsigned Opc = MI.getOpcode();
2182  bool IsF16 = false;
2183  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2184 
2185  switch (Opc) {
2186  default:
2187  return nullptr;
2188  case AMDGPU::V_MAC_F16_e64:
2189  IsF16 = true;
2191  case AMDGPU::V_MAC_F32_e64:
2192  case AMDGPU::V_FMAC_F32_e64:
2193  break;
2194  case AMDGPU::V_MAC_F16_e32:
2195  IsF16 = true;
2197  case AMDGPU::V_MAC_F32_e32:
2198  case AMDGPU::V_FMAC_F32_e32: {
2199  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2200  AMDGPU::OpName::src0);
2201  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2202  if (!Src0->isReg() && !Src0->isImm())
2203  return nullptr;
2204 
2205  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2206  return nullptr;
2207 
2208  break;
2209  }
2210  }
2211 
2212  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2213  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2214  const MachineOperand *Src0Mods =
2215  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2216  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2217  const MachineOperand *Src1Mods =
2218  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2219  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2220  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2221  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2222 
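  // Prefer the immediate MAD forms when an operand is a foldable immediate:
  // v_madak (src0 * src1 + K) when the addend is the constant, and
  // v_madmk (src0 * K + src2) when one of the multiplicands is.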
2223  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2224  // If we have an SGPR input, we will violate the constant bus restriction.
2225  (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2226  if (auto Imm = getFoldableImm(Src2)) {
2227  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2228  get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2229  .add(*Dst)
2230  .add(*Src0)
2231  .add(*Src1)
2232  .addImm(Imm);
2233  }
2234  if (auto Imm = getFoldableImm(Src1)) {
2235  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2236  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2237  .add(*Dst)
2238  .add(*Src0)
2239  .addImm(Imm)
2240  .add(*Src2);
2241  }
2242  if (auto Imm = getFoldableImm(Src0)) {
2243  if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2244  AMDGPU::OpName::src0), Src1))
2245  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2246  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2247  .add(*Dst)
2248  .add(*Src1)
2249  .addImm(Imm)
2250  .add(*Src2);
2251  }
2252  }
2253 
2254  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2255  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2256  (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2257  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2258  .add(*Dst)
2259  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2260  .add(*Src0)
2261  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2262  .add(*Src1)
2263  .addImm(0) // Src mods
2264  .add(*Src2)
2265  .addImm(Clamp ? Clamp->getImm() : 0)
2266  .addImm(Omod ? Omod->getImm() : 0);
2267 }
2268 
2269 // It's not generally safe to move VALU instructions across these since it will
2270 // start using the register as a base index rather than directly.
2271 // XXX - Why isn't hasSideEffects sufficient for these?
2273  switch (MI.getOpcode()) {
2274  case AMDGPU::S_SET_GPR_IDX_ON:
2275  case AMDGPU::S_SET_GPR_IDX_MODE:
2276  case AMDGPU::S_SET_GPR_IDX_OFF:
2277  return true;
2278  default:
2279  return false;
2280  }
2281 }
2282 
2284  const MachineBasicBlock *MBB,
2285  const MachineFunction &MF) const {
2286  // XXX - Do we want the SP check in the base implementation?
2287 
2288  // Target-independent instructions do not have an implicit-use of EXEC, even
2289  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2290  // boundaries prevents incorrect movements of such instructions.
2291  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2292  MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2293  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2294  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2296 }
2297 
2298 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2299  switch (Imm.getBitWidth()) {
2300  case 32:
2302  ST.hasInv2PiInlineImm());
2303  case 64:
2305  ST.hasInv2PiInlineImm());
2306  case 16:
2307  return ST.has16BitInsts() &&
2309  ST.hasInv2PiInlineImm());
2310  default:
2311  llvm_unreachable("invalid bitwidth");
2312  }
2313 }
2314 
2316  uint8_t OperandType) const {
2317  if (!MO.isImm() ||
2318  OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2319  OperandType > AMDGPU::OPERAND_SRC_LAST)
2320  return false;
2321 
2322  // MachineOperand provides no way to tell the true operand size, since it only
2323  // records a 64-bit value. We need to know the size to determine if a 32-bit
2324  // floating point immediate bit pattern is legal for an integer immediate. It
2325  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2326 
2327  int64_t Imm = MO.getImm();
2328  switch (OperandType) {
2333  int32_t Trunc = static_cast<int32_t>(Imm);
2334  return Trunc == Imm &&
2336  }
2342  ST.hasInv2PiInlineImm());
2347  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2348  // A few special case instructions have 16-bit operands on subtargets
2349  // where 16-bit instructions are not legal.
2350  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2351  // constants in these cases
2352  int16_t Trunc = static_cast<int16_t>(Imm);
2353  return ST.has16BitInsts() &&
2355  }
2356 
2357  return false;
2358  }
2361  if (isUInt<16>(Imm)) {
2362  int16_t Trunc = static_cast<int16_t>(Imm);
2363  return ST.has16BitInsts() &&
2365  }
2366  if (!(Imm & 0xffff)) {
2367  return ST.has16BitInsts() &&
2369  }
2370  uint32_t Trunc = static_cast<uint32_t>(Imm);
2372  }
2373  default:
2374  llvm_unreachable("invalid bitwidth");
2375  }
2376 }
2377 
2379  const MCOperandInfo &OpInfo) const {
2380  switch (MO.getType()) {
2382  return false;
2384  return !isInlineConstant(MO, OpInfo);
2390  return true;
2391  default:
2392  llvm_unreachable("unexpected operand type");
2393  }
2394 }
2395 
2396 static bool compareMachineOp(const MachineOperand &Op0,
2397  const MachineOperand &Op1) {
2398  if (Op0.getType() != Op1.getType())
2399  return false;
2400 
2401  switch (Op0.getType()) {
2403  return Op0.getReg() == Op1.getReg();
2405  return Op0.getImm() == Op1.getImm();
2406  default:
2407  llvm_unreachable("Didn't expect to be comparing these operand types");
2408  }
2409 }
2410 
2412  const MachineOperand &MO) const {
2413  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2414 
2415  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2416 
2417  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2418  return true;
2419 
2420  if (OpInfo.RegClass < 0)
2421  return false;
2422 
2423  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2424  return RI.opCanUseInlineConstant(OpInfo.OperandType);
2425 
2426  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2427 }
2428 
2429 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2430  int Op32 = AMDGPU::getVOPe32(Opcode);
2431  if (Op32 == -1)
2432  return false;
2433 
2434  return pseudoToMCOpcode(Op32) != -1;
2435 }
2436 
2437 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2438  // The src0_modifier operand is present on all instructions
2439  // that have modifiers.
2440 
2441  return AMDGPU::getNamedOperandIdx(Opcode,
2442  AMDGPU::OpName::src0_modifiers) != -1;
2443 }
2444 
2446  unsigned OpName) const {
2447  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2448  return Mods && Mods->getImm();
2449 }
2450 
2452  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2453  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2454  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2455  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2456  hasModifiersSet(MI, AMDGPU::OpName::omod);
2457 }
2458 
2460  const MachineOperand &MO,
2461  const MCOperandInfo &OpInfo) const {
2462  // Literal constants use the constant bus.
2463  //if (isLiteralConstantLike(MO, OpInfo))
2464  // return true;
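  // For example, a literal constant, an SGPR, VCC, M0, EXEC, or FLAT_SCR all
  // occupy the single constant bus slot, while a VGPR or an inline constant
  // does not.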
2465  if (MO.isImm())
2466  return !isInlineConstant(MO, OpInfo);
2467 
2468  if (!MO.isReg())
2469  return true; // Misc other operands like FrameIndex
2470 
2471  if (!MO.isUse())
2472  return false;
2473 
2475  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2476 
2477  // FLAT_SCR is just an SGPR pair.
2478  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2479  return true;
2480 
2481  // EXEC register uses the constant bus.
2482  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2483  return true;
2484 
2485  // SGPRs use the constant bus
2486  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2487  (!MO.isImplicit() &&
2488  (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2489  AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2490 }
2491 
2492 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2493  for (const MachineOperand &MO : MI.implicit_operands()) {
2494  // We only care about reads.
2495  if (MO.isDef())
2496  continue;
2497 
2498  switch (MO.getReg()) {
2499  case AMDGPU::VCC:
2500  case AMDGPU::M0:
2501  case AMDGPU::FLAT_SCR:
2502  return MO.getReg();
2503 
2504  default:
2505  break;
2506  }
2507  }
2508 
2509  return AMDGPU::NoRegister;
2510 }
2511 
2512 static bool shouldReadExec(const MachineInstr &MI) {
2513  if (SIInstrInfo::isVALU(MI)) {
2514  switch (MI.getOpcode()) {
2515  case AMDGPU::V_READLANE_B32:
2516  case AMDGPU::V_READLANE_B32_si:
2517  case AMDGPU::V_READLANE_B32_vi:
2518  case AMDGPU::V_WRITELANE_B32:
2519  case AMDGPU::V_WRITELANE_B32_si:
2520  case AMDGPU::V_WRITELANE_B32_vi:
2521  return false;
2522  }
2523 
2524  return true;
2525  }
2526 
2527  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2528  SIInstrInfo::isSALU(MI) ||
2529  SIInstrInfo::isSMRD(MI))
2530  return false;
2531 
2532  return true;
2533 }
2534 
2535 static bool isSubRegOf(const SIRegisterInfo &TRI,
2536  const MachineOperand &SuperVec,
2537  const MachineOperand &SubReg) {
2539  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2540 
2541  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2542  SubReg.getReg() == SuperVec.getReg();
2543 }
2544 
2546  StringRef &ErrInfo) const {
2547  uint16_t Opcode = MI.getOpcode();
2548  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2549  return true;
2550 
2551  const MachineFunction *MF = MI.getParent()->getParent();
2552  const MachineRegisterInfo &MRI = MF->getRegInfo();
2553 
2554  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2555  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2556  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2557 
2558  // Make sure the number of operands is correct.
2559  const MCInstrDesc &Desc = get(Opcode);
2560  if (!Desc.isVariadic() &&
2561  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2562  ErrInfo = "Instruction has wrong number of operands.";
2563  return false;
2564  }
2565 
2566  if (MI.isInlineAsm()) {
2567  // Verify register classes for inlineasm constraints.
2568  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2569  I != E; ++I) {
2570  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2571  if (!RC)
2572  continue;
2573 
2574  const MachineOperand &Op = MI.getOperand(I);
2575  if (!Op.isReg())
2576  continue;
2577 
2578  unsigned Reg = Op.getReg();
2579  if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2580  ErrInfo = "inlineasm operand has incorrect register class.";
2581  return false;
2582  }
2583  }
2584 
2585  return true;
2586  }
2587 
2588  // Make sure the register classes are correct.
2589  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2590  if (MI.getOperand(i).isFPImm()) {
2591  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2592  "all fp values to integers.";
2593  return false;
2594  }
2595 
2596  int RegClass = Desc.OpInfo[i].RegClass;
2597 
2598  switch (Desc.OpInfo[i].OperandType) {
2600  if (MI.getOperand(i).isImm()) {
2601  ErrInfo = "Illegal immediate value for operand.";
2602  return false;
2603  }
2604  break;
2607  break;
2614  const MachineOperand &MO = MI.getOperand(i);
2615  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2616  ErrInfo = "Illegal immediate value for operand.";
2617  return false;
2618  }
2619  break;
2620  }
2623  // Check if this operand is an immediate.
2624  // FrameIndex operands will be replaced by immediates, so they are
2625  // allowed.
2626  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2627  ErrInfo = "Expected immediate, but got non-immediate";
2628  return false;
2629  }
2631  default:
2632  continue;
2633  }
2634 
2635  if (!MI.getOperand(i).isReg())
2636  continue;
2637 
2638  if (RegClass != -1) {
2639  unsigned Reg = MI.getOperand(i).getReg();
2640  if (Reg == AMDGPU::NoRegister ||
2642  continue;
2643 
2644  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2645  if (!RC->contains(Reg)) {
2646  ErrInfo = "Operand has incorrect register class.";
2647  return false;
2648  }
2649  }
2650  }
2651 
2652  // Verify SDWA
2653  if (isSDWA(MI)) {
2654  if (!ST.hasSDWA()) {
2655  ErrInfo = "SDWA is not supported on this target";
2656  return false;
2657  }
2658 
2659  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2660 
2661  const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2662 
2663  for (int OpIdx : OpIndices) {
2664  if (OpIdx == -1)
2665  continue;
2666  const MachineOperand &MO = MI.getOperand(OpIdx);
2667 
2668  if (!ST.hasSDWAScalar()) {
2669  // Only VGPRs are allowed on VI.
2670  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2671  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2672  return false;
2673  }
2674  } else {
2675  // No immediates on GFX9
2676  if (!MO.isReg()) {
2677  ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2678  return false;
2679  }
2680  }
2681  }
2682 
2683  if (!ST.hasSDWAOmod()) {
2684  // No omod allowed on VI
2685  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2686  if (OMod != nullptr &&
2687  (!OMod->isImm() || OMod->getImm() != 0)) {
2688  ErrInfo = "OMod not allowed in SDWA instructions on VI";
2689  return false;
2690  }
2691  }
2692 
2693  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2694  if (isVOPC(BasicOpcode)) {
2695  if (!ST.hasSDWASdst() && DstIdx != -1) {
2696  // Only vcc allowed as dst on VI for VOPC
2697  const MachineOperand &Dst = MI.getOperand(DstIdx);
2698  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2699  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2700  return false;
2701  }
2702  } else if (!ST.hasSDWAOutModsVOPC()) {
2703  // No clamp allowed on GFX9 for VOPC
2704  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2705  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2706  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2707  return false;
2708  }
2709 
2710  // No omod allowed on GFX9 for VOPC
2711  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2712  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2713  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2714  return false;
2715  }
2716  }
2717  }
2718 
2719  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
2720  if (DstUnused && DstUnused->isImm() &&
2721  DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
2722  const MachineOperand &Dst = MI.getOperand(DstIdx);
2723  if (!Dst.isReg() || !Dst.isTied()) {
2724  ErrInfo = "Dst register should have tied register";
2725  return false;
2726  }
2727 
2728  const MachineOperand &TiedMO =
2729  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
2730  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
2731  ErrInfo =
2732  "Dst register should be tied to implicit use of preserved register";
2733  return false;
2734  } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
2735  Dst.getReg() != TiedMO.getReg()) {
2736  ErrInfo = "Dst register should use same physical register as preserved";
2737  return false;
2738  }
2739  }
2740  }
2741 
2742  // Verify VOP*. Ignore multiple sgpr operands on writelane.
2743  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
2744  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
2745  // Only look at the true operands. Only a real operand can use the constant
2746  // bus, and we don't want to check pseudo-operands like the source modifier
2747  // flags.
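  // For example, something like v_add_f32 v0, s0, s1 is rejected here: each
  // SGPR source needs the constant bus, but VOP* instructions may only read
  // it once.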
2748  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2749 
2750  unsigned ConstantBusCount = 0;
2751  unsigned LiteralCount = 0;
2752 
2753  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2754  ++ConstantBusCount;
2755 
2756  unsigned SGPRUsed = findImplicitSGPRRead(MI);
2757  if (SGPRUsed != AMDGPU::NoRegister)
2758  ++ConstantBusCount;
2759 
2760  for (int OpIdx : OpIndices) {
2761  if (OpIdx == -1)
2762  break;
2763  const MachineOperand &MO = MI.getOperand(OpIdx);
2764  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
2765  if (MO.isReg()) {
2766  if (MO.getReg() != SGPRUsed)
2767  ++ConstantBusCount;
2768  SGPRUsed = MO.getReg();
2769  } else {
2770  ++ConstantBusCount;
2771  ++LiteralCount;
2772  }
2773  }
2774  }
2775  if (ConstantBusCount > 1) {
2776  ErrInfo = "VOP* instruction uses the constant bus more than once";
2777  return false;
2778  }
2779 
2780  if (isVOP3(MI) && LiteralCount) {
2781  ErrInfo = "VOP3 instruction uses literal";
2782  return false;
2783  }
2784  }
2785 
2786  // Verify misc. restrictions on specific instructions.
2787  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2788  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
2789  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2790  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2791  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2792  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
2793  if (!compareMachineOp(Src0, Src1) &&
2794  !compareMachineOp(Src0, Src2)) {
2795  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
2796  return false;
2797  }
2798  }
2799  }
2800 
2801  if (isSOPK(MI)) {
2802  int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
2803  if (sopkIsZext(MI)) {
2804  if (!isUInt<16>(Imm)) {
2805  ErrInfo = "invalid immediate for SOPK instruction";
2806  return false;
2807  }
2808  } else {
2809  if (!isInt<16>(Imm)) {
2810  ErrInfo = "invalid immediate for SOPK instruction";
2811  return false;
2812  }
2813  }
2814  }
2815 
2816  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
2817  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
2818  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2819  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
2820  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2821  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
2822 
2823  const unsigned StaticNumOps = Desc.getNumOperands() +
2824  Desc.getNumImplicitUses();
2825  const unsigned NumImplicitOps = IsDst ? 2 : 1;
2826 
2827  // Allow additional implicit operands. This allows a fixup done by the post
2828  // RA scheduler where the main implicit operand is killed and implicit-defs
2829  // are added for sub-registers that remain live after this instruction.
2830  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
2831  ErrInfo = "missing implicit register operands";
2832  return false;
2833  }
2834 
2835  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2836  if (IsDst) {
2837  if (!Dst->isUse()) {
2838  ErrInfo = "v_movreld_b32 vdst should be a use operand";
2839  return false;
2840  }
2841 
2842  unsigned UseOpIdx;
2843  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
2844  UseOpIdx != StaticNumOps + 1) {
2845  ErrInfo = "movrel implicit operands should be tied";
2846  return false;
2847  }
2848  }
2849 
2850  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2851  const MachineOperand &ImpUse
2852  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
2853  if (!ImpUse.isReg() || !ImpUse.isUse() ||
2854  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
2855  ErrInfo = "src0 should be subreg of implicit vector use";
2856  return false;
2857  }
2858  }
2859 
2860  // Make sure we aren't losing exec uses in the td files. This mostly requires
2861  // being careful when using let Uses to try to add other use registers.
2862  if (shouldReadExec(MI)) {
2863  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
2864  ErrInfo = "VALU instruction does not implicitly read exec mask";
2865  return false;
2866  }
2867  }
2868 
2869  if (isSMRD(MI)) {
2870  if (MI.mayStore()) {
2871  // The register offset form of scalar stores may only use m0 as the
2872  // soffset register.
2873  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
2874  if (Soff && Soff->getReg() != AMDGPU::M0) {
2875  ErrInfo = "scalar stores must use m0 as offset register";
2876  return false;
2877  }
2878  }
2879  }
2880 
2881  if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
2882  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2883  if (Offset->getImm() != 0) {
2884  ErrInfo = "subtarget does not support offsets in flat instructions";
2885  return false;
2886  }
2887  }
2888 
2889  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
2890  if (DppCt) {
2891  using namespace AMDGPU::DPP;
2892 
2893  unsigned DC = DppCt->getImm();
2894  if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
2895  DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
2900  ErrInfo = "Invalid dpp_ctrl value";
2901  return false;
2902  }
2903  }
2904 
2905  return true;
2906 }
2907 
2908 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
2909  switch (MI.getOpcode()) {
2910  default: return AMDGPU::INSTRUCTION_LIST_END;
2911  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
2912  case AMDGPU::COPY: return AMDGPU::COPY;
2913  case AMDGPU::PHI: return AMDGPU::PHI;
2914  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
2915  case AMDGPU::WQM: return AMDGPU::WQM;
2916  case AMDGPU::WWM: return AMDGPU::WWM;
2917  case AMDGPU::S_MOV_B32:
2918  return MI.getOperand(1).isReg() ?
2919  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
2920  case AMDGPU::S_ADD_I32:
2921  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
2922  case AMDGPU::S_ADDC_U32:
2923  return AMDGPU::V_ADDC_U32_e32;
2924  case AMDGPU::S_SUB_I32:
2925  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
2926  // FIXME: These are not consistently handled, and are selected when the carry is
2927  // used.
2928  case AMDGPU::S_ADD_U32:
2929  return AMDGPU::V_ADD_I32_e32;
2930  case AMDGPU::S_SUB_U32:
2931  return AMDGPU::V_SUB_I32_e32;
2932  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
2933  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
2934  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
2935  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
2936  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
2937  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
2938  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
2939  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
2940  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
2941  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
2942  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
2943  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
2944  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
2945  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
2946  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
2947  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
2948  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
2949  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
2950  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
2951  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
2952  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
2953  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
2954  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
2955  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
2956  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
2957  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
2958  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
2959  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
2960  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
2961  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
2962  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
2963  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
2964  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
2965  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
2966  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
2967  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
2968  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
2969  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
2970  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
2971  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
2972  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
2973  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
2974  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
2975  }
2976 }
2977 
2979  unsigned OpNo) const {
2981  const MCInstrDesc &Desc = get(MI.getOpcode());
2982  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
2983  Desc.OpInfo[OpNo].RegClass == -1) {
2984  unsigned Reg = MI.getOperand(OpNo).getReg();
2985 
2987  return MRI.getRegClass(Reg);
2988  return RI.getPhysRegClass(Reg);
2989  }
2990 
2991  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
2992  return RI.getRegClass(RCID);
2993 }
2994 
2995 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
2996  switch (MI.getOpcode()) {
2997  case AMDGPU::COPY:
2998  case AMDGPU::REG_SEQUENCE:
2999  case AMDGPU::PHI:
3000  case AMDGPU::INSERT_SUBREG:
3001  return RI.hasVGPRs(getOpRegClass(MI, 0));
3002  default:
3003  return RI.hasVGPRs(getOpRegClass(MI, OpNo));
3004  }
3005 }
3006 
3007 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3009  MachineBasicBlock *MBB = MI.getParent();
3010  MachineOperand &MO = MI.getOperand(OpIdx);
3012  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3013  const TargetRegisterClass *RC = RI.getRegClass(RCID);
3014  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3015  if (MO.isReg())
3016  Opcode = AMDGPU::COPY;
3017  else if (RI.isSGPRClass(RC))
3018  Opcode = AMDGPU::S_MOV_B32;
3019 
3020  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3021  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3022  VRC = &AMDGPU::VReg_64RegClass;
3023  else
3024  VRC = &AMDGPU::VGPR_32RegClass;
3025 
3026  unsigned Reg = MRI.createVirtualRegister(VRC);
3027  DebugLoc DL = MBB->findDebugLoc(I);
3028  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3029  MO.ChangeToRegister(Reg, false);
3030 }
3031 
3034  MachineOperand &SuperReg,
3035  const TargetRegisterClass *SuperRC,
3036  unsigned SubIdx,
3037  const TargetRegisterClass *SubRC)
3038  const {
3039  MachineBasicBlock *MBB = MI->getParent();
3040  DebugLoc DL = MI->getDebugLoc();
3041  unsigned SubReg = MRI.createVirtualRegister(SubRC);
3042 
3043  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3044  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3045  .addReg(SuperReg.getReg(), 0, SubIdx);
3046  return SubReg;
3047  }
3048 
3049  // Just in case the super register is itself a sub-register, copy it to a new
3050  // value so we don't need to worry about merging its subreg index with the
3051  // SubIdx passed to this function. The register coalescer should be able to
3052  // eliminate this extra copy.
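  // For example (with illustrative virtual registers), extracting sub0 from
  // %super.sub2_sub3 becomes:
  //   %tmp = COPY %super.sub2_sub3
  //   %result = COPY %tmp.sub0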
3053  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3054 
3055  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3056  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3057 
3058  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3059  .addReg(NewSuperReg, 0, SubIdx);
3060 
3061  return SubReg;
3062 }
3063 
3067  MachineOperand &Op,
3068  const TargetRegisterClass *SuperRC,
3069  unsigned SubIdx,
3070  const TargetRegisterClass *SubRC) const {
3071  if (Op.isImm()) {
3072  if (SubIdx == AMDGPU::sub0)
3073  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3074  if (SubIdx == AMDGPU::sub1)
3075  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3076 
3077  llvm_unreachable("Unhandled register index for immediate");
3078  }
3079 
3080  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3081  SubIdx, SubRC);
3082  return MachineOperand::CreateReg(SubReg, false);
3083 }
3084 
3085 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3086 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3087  assert(Inst.getNumExplicitOperands() == 3);
3088  MachineOperand Op1 = Inst.getOperand(1);
3089  Inst.RemoveOperand(1);
3090  Inst.addOperand(Op1);
3091 }
3092 
3094  const MCOperandInfo &OpInfo,
3095  const MachineOperand &MO) const {
3096  if (!MO.isReg())
3097  return false;
3098 
3099  unsigned Reg = MO.getReg();
3100  const TargetRegisterClass *RC =
3102  MRI.getRegClass(Reg) :
3103  RI.getPhysRegClass(Reg);
3104 
3105  const SIRegisterInfo *TRI =
3106  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3107  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3108 
3109  // In order to be legal, the common sub-class must be equal to the
3110  // class of the current operand. For example:
3111  //
3112  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3113  // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3114  //
3115  // s_sendmsg 0, s0 ; Operand defined as m0reg
3116  // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3117 
3118  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3119 }
3120 
3122  const MCOperandInfo &OpInfo,
3123  const MachineOperand &MO) const {
3124  if (MO.isReg())
3125  return isLegalRegOperand(MRI, OpInfo, MO);
3126 
3127  // Handle non-register types that are treated like immediates.
3128  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3129  return true;
3130 }
3131 
3132 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3133  const MachineOperand *MO) const {
3135  const MCInstrDesc &InstDesc = MI.getDesc();
3136  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3137  const TargetRegisterClass *DefinedRC =
3138  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3139  if (!MO)
3140  MO = &MI.getOperand(OpIdx);
3141 
3142  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3143 
3144  RegSubRegPair SGPRUsed;
3145  if (MO->isReg())
3146  SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3147 
3148  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3149  if (i == OpIdx)
3150  continue;
3151  const MachineOperand &Op = MI.getOperand(i);
3152  if (Op.isReg()) {
3153  if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3154  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3155  return false;
3156  }
3157  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3158  return false;
3159  }
3160  }
3161  }
3162 
3163  if (MO->isReg()) {
3164  assert(DefinedRC);
3165  return isLegalRegOperand(MRI, OpInfo, *MO);
3166  }
3167 
3168  // Handle non-register types that are treated like immediates.
3169  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3170 
3171  if (!DefinedRC) {
3172  // This operand expects an immediate.
3173  return true;
3174  }
3175 
3176  return isImmOperandLegal(MI, OpIdx, *MO);
3177 }
3178 
3180  MachineInstr &MI) const {
3181  unsigned Opc = MI.getOpcode();
3182  const MCInstrDesc &InstrDesc = get(Opc);
3183 
3184  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3185  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3186 
3187  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3188  // we need to only have one constant bus use.
3189  //
3190  // Note we do not need to worry about literal constants here. They are
3191  // disabled for the operand type for instructions because they will always
3192  // violate the one constant bus use rule.
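  // For example, v_addc_u32_e32 implicitly reads VCC, which already occupies
  // the constant bus, so an SGPR src0 must first be moved into a VGPR.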
3193  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3194  if (HasImplicitSGPR) {
3195  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3196  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3197 
3198  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3199  legalizeOpWithMove(MI, Src0Idx);
3200  }
3201 
3202  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3203  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3204  // src0/src1 with V_READFIRSTLANE.
3205  if (Opc == AMDGPU::V_WRITELANE_B32) {
3206  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3207  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3208  const DebugLoc &DL = MI.getDebugLoc();
3209  if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3210  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3211  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3212  .add(Src0);
3213  Src0.ChangeToRegister(Reg, false);
3214  }
3215  if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3216  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3217  const DebugLoc &DL = MI.getDebugLoc();
3218  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3219  .add(Src1);
3220  Src1.ChangeToRegister(Reg, false);
3221  }
3222  return;
3223  }
3224 
3225  // VOP2 instructions accept all operand types for src0, so we don't need to
3226  // check its legality. If src1 is already legal, we don't need to do anything.
3227  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3228  return;
3229 
3230  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3231  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3232  // select is uniform.
3233  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3234  RI.isVGPR(MRI, Src1.getReg())) {
3235  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3236  const DebugLoc &DL = MI.getDebugLoc();
3237  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3238  .add(Src1);
3239  Src1.ChangeToRegister(Reg, false);
3240  return;
3241  }
3242 
3243  // We do not use commuteInstruction here because it is too aggressive and will
3244  // commute if it is possible. We only want to commute here if it improves
3245  // legality. This can be called a fairly large number of times so don't waste
3246  // compile time pointlessly swapping and checking legality again.
3247  if (HasImplicitSGPR || !MI.isCommutable()) {
3248  legalizeOpWithMove(MI, Src1Idx);
3249  return;
3250  }
3251 
3252  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3253  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3254 
3255  // If src0 can be used as src1, commuting will make the operands legal.
3256  // Otherwise we have to give up and insert a move.
3257  //
3258  // TODO: Other immediate-like operand kinds could be commuted if there was a
3259  // MachineOperand::ChangeTo* for them.
3260  if ((!Src1.isImm() && !Src1.isReg()) ||
3261  !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3262  legalizeOpWithMove(MI, Src1Idx);
3263  return;
3264  }
3265 
3266  int CommutedOpc = commuteOpcode(MI);
3267  if (CommutedOpc == -1) {
3268  legalizeOpWithMove(MI, Src1Idx);
3269  return;
3270  }
3271 
3272  MI.setDesc(get(CommutedOpc));
3273 
3274  unsigned Src0Reg = Src0.getReg();
3275  unsigned Src0SubReg = Src0.getSubReg();
3276  bool Src0Kill = Src0.isKill();
3277 
3278  if (Src1.isImm())
3279  Src0.ChangeToImmediate(Src1.getImm());
3280  else if (Src1.isReg()) {
3281  Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3282  Src0.setSubReg(Src1.getSubReg());
3283  } else
3284  llvm_unreachable("Should only have register or immediate operands");
3285 
3286  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3287  Src1.setSubReg(Src0SubReg);
3288 }
3289 
3290 // Legalize VOP3 operands. Because all operand types are supported for any
3291 // operand, and since literal constants are not allowed and should never be
3292 // seen, we only need to worry about inserting copies if we use multiple SGPR
3293 // operands.
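// For example, a VOP3 add with two distinct SGPR sources keeps one of them
// and copies the other into a VGPR via legalizeOpWithMove.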
3295  MachineInstr &MI) const {
3296  unsigned Opc = MI.getOpcode();
3297 
3298  int VOP3Idx[3] = {
3299  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3300  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3301  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3302  };
3303 
3304  // Find the one SGPR operand we are allowed to use.
3305  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3306 
3307  for (unsigned i = 0; i < 3; ++i) {
3308  int Idx = VOP3Idx[i];
3309  if (Idx == -1)
3310  break;
3311  MachineOperand &MO = MI.getOperand(Idx);
3312 
3313  // We should never see a VOP3 instruction with an illegal immediate operand.
3314  if (!MO.isReg())
3315  continue;
3316 
3317  if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3318  continue; // VGPRs are legal
3319 
3320  if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3321  SGPRReg = MO.getReg();
3322  // We can use one SGPR in each VOP3 instruction.
3323  continue;
3324  }
3325 
3326  // If we make it this far, then the operand is not legal and we must
3327  // legalize it.
3328  legalizeOpWithMove(MI, Idx);
3329  }
3330 }
3331 
3333  MachineRegisterInfo &MRI) const {
3334  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3335  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3336  unsigned DstReg = MRI.createVirtualRegister(SRC);
3337  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3338 
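  // For example, a 64-bit pointer held in VGPRs expands into two
  // v_readfirstlane_b32s (one per 32-bit half) followed by a REG_SEQUENCE
  // that reassembles the SGPR pair.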
3339  if (SubRegs == 1) {
3340  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3341  get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3342  .addReg(SrcReg);
3343  return DstReg;
3344  }
3345 
3347  for (unsigned i = 0; i < SubRegs; ++i) {
3348  unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3349  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3350  get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3351  .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3352  SRegs.push_back(SGPR);
3353  }
3354 
3355  MachineInstrBuilder MIB =
3356  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3357  get(AMDGPU::REG_SEQUENCE), DstReg);
3358  for (unsigned i = 0; i < SubRegs; ++i) {
3359  MIB.addReg(SRegs[i]);
3360  MIB.addImm(RI.getSubRegFromChannel(i));
3361  }
3362  return DstReg;
3363 }
3364 
3366  MachineInstr &MI) const {
3367 
3368  // If the pointer is stored in VGPRs, then we need to move it to
3369  // SGPRs using v_readfirstlane. This is safe because we only select
3370  // loads with uniform pointers to SMRD instructions, so we know the
3371  // pointer value is uniform.
3372  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3373  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3374  unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3375  SBase->setReg(SGPR);
3376  }
3377 }
3378 
3381  const TargetRegisterClass *DstRC,
3382  MachineOperand &Op,
3384  const DebugLoc &DL) const {
3385  unsigned OpReg = Op.getReg();
3386  unsigned OpSubReg = Op.getSubReg();
3387 
3388  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3389  RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3390 
3391  // Check if operand is already the correct register class.
3392  if (DstRC == OpRC)
3393  return;
3394 
3395  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3396  MachineInstr *Copy =
3397  BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3398 
3399  Op.setReg(DstReg);
3400  Op.setSubReg(0);
3401 
3402  MachineInstr *Def = MRI.getVRegDef(OpReg);
3403  if (!Def)
3404  return;
3405 
3406  // Try to eliminate the copy if it is copying an immediate value.
3407  if (Def->isMoveImmediate())
3408  FoldImmediate(*Copy, *Def, OpReg, &MRI);
3409 }
3410 
3412  MachineFunction &MF = *MI.getParent()->getParent();
3414 
3415  // Legalize VOP2
3416  if (isVOP2(MI) || isVOPC(MI)) {
3417  legalizeOperandsVOP2(MRI, MI);
3418  return;
3419  }
3420 
3421  // Legalize VOP3
3422  if (isVOP3(MI)) {
3423  legalizeOperandsVOP3(MRI, MI);
3424  return;
3425  }
3426 
3427  // Legalize SMRD
3428  if (isSMRD(MI)) {
3429  legalizeOperandsSMRD(MRI, MI);
3430  return;
3431  }
3432 
3433  // Legalize REG_SEQUENCE and PHI
3434  // The register class of the operands must be the same type as the register
3435  // class of the output.
3436  if (MI.getOpcode() == AMDGPU::PHI) {
3437  const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3438  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3439  if (!MI.getOperand(i).isReg() ||
3441  continue;
3442  const TargetRegisterClass *OpRC =
3443  MRI.getRegClass(MI.getOperand(i).getReg());
3444  if (RI.hasVGPRs(OpRC)) {
3445  VRC = OpRC;
3446  } else {
3447  SRC = OpRC;
3448  }
3449  }
3450 
3451  // If any of the operands are VGPR registers, then they all must be VGPRs;
3452  // otherwise we will create illegal VGPR->SGPR copies when legalizing
3453  // them.
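  // For example, a PHI with one VGPR input and one SGPR input must produce a
  // VGPR result, so the SGPR input is copied to a VGPR below.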
3454  if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3455  if (!VRC) {
3456  assert(SRC);
3457  VRC = RI.getEquivalentVGPRClass(SRC);
3458  }
3459  RC = VRC;
3460  } else {
3461  RC = SRC;
3462  }
3463 
3464  // Update all the operands so they have the same type.
3465  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3466  MachineOperand &Op = MI.getOperand(I);
3468  continue;
3469 
3470  // MI is a PHI instruction.
3471  MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3472  MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3473 
3474  // Avoid creating no-op copies with the same src and dst reg class. These
3475  // confuse some of the machine passes.
3476  legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3477  }
3478  }
3479 
3480  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3481  // VGPR dest type and SGPR sources, insert copies so all operands are
3482  // VGPRs. This seems to help operand folding / the register coalescer.
3483  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3484  MachineBasicBlock *MBB = MI.getParent();
3485  const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3486  if (RI.hasVGPRs(DstRC)) {
3487  // Update all the operands so they are VGPR register classes. These may
3488  // not be the same register class because REG_SEQUENCE supports mixing
3489  // subregister index types e.g. sub0_sub1 + sub2 + sub3
3490  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3491  MachineOperand &Op = MI.getOperand(I);
3493  continue;
3494 
3495  const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3496  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3497  if (VRC == OpRC)
3498  continue;
3499 
3500  legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3501  Op.setIsKill();
3502  }
3503  }
3504 
3505  return;
3506  }
3507 
3508  // Legalize INSERT_SUBREG
3509  // src0 must have the same register class as dst
3510  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3511  unsigned Dst = MI.getOperand(0).getReg();
3512  unsigned Src0 = MI.getOperand(1).getReg();
3513  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3514  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3515  if (DstRC != Src0RC) {
3516  MachineBasicBlock *MBB = MI.getParent();
3517  MachineOperand &Op = MI.getOperand(1);
3518  legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3519  }
3520  return;
3521  }
3522 
3523  // Legalize SI_INIT_M0
3524  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
3525  MachineOperand &Src = MI.getOperand(0);
3526  if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
3527  Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
3528  return;
3529  }
3530 
3531  // Legalize MIMG and MUBUF/MTBUF for shaders.
3532  //
3533  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3534  // scratch memory access. In both cases, the legalization never involves
3535  // conversion to the addr64 form.
3536  if (isMIMG(MI) ||
3538  (isMUBUF(MI) || isMTBUF(MI)))) {
3539  MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3540  if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
3541  unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3542  SRsrc->setReg(SGPR);
3543  }
3544 
3545  MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3546  if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
3547  unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3548  SSamp->setReg(SGPR);
3549  }
3550  return;
3551  }
3552 
3553  // Legalize MUBUF* instructions by converting to addr64 form.
3554  // FIXME: If we start using the non-addr64 instructions for compute, we
3555  // may need to legalize them as above. This especially applies to the
3556  // buffer_load_format_* variants and variants with idxen (or bothen).
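  // For example, an _OFFSET form buffer load whose resource descriptor ended
  // up in VGPRs is rewritten to the _ADDR64 form below: the pointer is pulled
  // out of the descriptor into vaddr, and a new descriptor with a zero base
  // pointer is built in its place.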
3557  int SRsrcIdx =
3558  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3559  if (SRsrcIdx != -1) {
3560  // We have an MUBUF instruction
3561  MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
3562  unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
3563  if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
3564  RI.getRegClass(SRsrcRC))) {
3565  // The operands are legal.
 3566  // FIXME: We may need to legalize operands besides srsrc.
3567  return;
3568  }
3569 
3570  MachineBasicBlock &MBB = *MI.getParent();
3571 
3572  // Extract the ptr from the resource descriptor.
3573  unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
3574  &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3575 
3576  // Create an empty resource descriptor
3577  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3578  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3579  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3580  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3581  uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
3582 
3583  // Zero64 = 0
3584  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
3585  .addImm(0);
3586 
3587  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3588  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3589  .addImm(RsrcDataFormat & 0xFFFFFFFF);
3590 
3591  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3592  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3593  .addImm(RsrcDataFormat >> 32);
3594 
3595  // NewSRsrc = {Zero64, SRsrcFormat}
3596  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3597  .addReg(Zero64)
3598  .addImm(AMDGPU::sub0_sub1)
3599  .addReg(SRsrcFormatLo)
3600  .addImm(AMDGPU::sub2)
3601  .addReg(SRsrcFormatHi)
3602  .addImm(AMDGPU::sub3);
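  // At this point the rewritten descriptor has the layout
  //   NewSRsrc = { 0                        (sub0_sub1, the base address),
  //                RSRC_DATA_FORMAT[31:0]   (sub2),
  //                RSRC_DATA_FORMAT[63:32]  (sub3) }
  // The base-address field can be zeroed because the original pointer is
  // folded into vaddr below instead.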
3603 
3604  MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3605  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3606  if (VAddr) {
3607  // This is already an ADDR64 instruction so we need to add the pointer
3608  // extracted from the resource descriptor to the current value of VAddr.
3609  unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3610  unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3611 
3612  // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
3613  DebugLoc DL = MI.getDebugLoc();
3614  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3615  .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3616  .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
3617 
3618  // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
3619  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3620  .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3621  .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
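  // The _e32 forms use VCC implicitly: V_ADD_I32_e32 defines VCC with the
  // carry-out of the low dwords and V_ADDC_U32_e32 reads it back as the
  // carry-in, so the pair computes the full 64-bit sum SRsrcPtr + VAddr.
  // For example (illustrative values), 0x00000000ffffffff + 0x1 carries
  // into the high half and yields 0x0000000100000000.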
3622 
3623  // NewVaddr = {NewVaddrHi, NewVaddrLo}
3624  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
3625  .addReg(NewVAddrLo)
3626  .addImm(AMDGPU::sub0)
3627  .addReg(NewVAddrHi)
3628  .addImm(AMDGPU::sub1);
3629  } else {
 3630  // This instruction is the _OFFSET variant, so we need to convert it to
3631  // ADDR64.
3632  assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
3634  "FIXME: Need to emit flat atomics here");
3635 
3636  MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
3637  MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3638  MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
3639  unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
3640 
 3641  // Atomics with return have an additional tied operand and are
3642  // missing some of the special bits.
3643  MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
3644  MachineInstr *Addr64;
3645 
3646  if (!VDataIn) {
3647  // Regular buffer load / store.
3648  MachineInstrBuilder MIB =
3649  BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3650  .add(*VData)
3651  .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3652  // This will be replaced later
3653  // with the new value of vaddr.
3654  .add(*SRsrc)
3655  .add(*SOffset)
3656  .add(*Offset);
3657 
3658  // Atomics do not have this operand.
3659  if (const MachineOperand *GLC =
3660  getNamedOperand(MI, AMDGPU::OpName::glc)) {
3661  MIB.addImm(GLC->getImm());
3662  }
3663 
3664  MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
3665 
3666  if (const MachineOperand *TFE =
3667  getNamedOperand(MI, AMDGPU::OpName::tfe)) {
3668  MIB.addImm(TFE->getImm());
3669  }
3670 
3672  Addr64 = MIB;
3673  } else {
3674  // Atomics with return.
3675  Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3676  .add(*VData)
3677  .add(*VDataIn)
3678  .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3679  // This will be replaced later
3680  // with the new value of vaddr.
3681  .add(*SRsrc)
3682  .add(*SOffset)
3683  .add(*Offset)
3684  .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
3686  }
3687 
3688  MI.removeFromParent();
3689 
 3690  // NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}, the pointer extracted from srsrc
3691  BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
3692  NewVAddr)
3693  .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3694  .addImm(AMDGPU::sub0)
3695  .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3696  .addImm(AMDGPU::sub1);
3697 
3698  VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
3699  SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
3700  }
3701 
3702  // Update the instruction to use NewVaddr
3703  VAddr->setReg(NewVAddr);
3704  // Update the instruction to use NewSRsrc
3705  SRsrc->setReg(NewSRsrc);
3706  }
3707 }
3708 
3710  SetVectorType Worklist;
3711  Worklist.insert(&TopInst);
3712 
3713  while (!Worklist.empty()) {
3714  MachineInstr &Inst = *Worklist.pop_back_val();
3715  MachineBasicBlock *MBB = Inst.getParent();
3717 
3718  unsigned Opcode = Inst.getOpcode();
3719  unsigned NewOpcode = getVALUOp(Inst);
3720 
3721  // Handle some special cases
3722  switch (Opcode) {
3723  default:
3724  break;
3725  case AMDGPU::S_ADD_U64_PSEUDO:
3726  case AMDGPU::S_SUB_U64_PSEUDO:
3727  splitScalar64BitAddSub(Worklist, Inst);
3728  Inst.eraseFromParent();
3729  continue;
3730  case AMDGPU::S_ADD_I32:
3731  case AMDGPU::S_SUB_I32:
3732  // FIXME: The u32 versions currently selected use the carry.
3733  if (moveScalarAddSub(Worklist, Inst))
3734  continue;
3735 
3736  // Default handling
3737  break;
3738  case AMDGPU::S_AND_B64:
3739  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
3740  Inst.eraseFromParent();
3741  continue;
3742 
3743  case AMDGPU::S_OR_B64:
3744  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
3745  Inst.eraseFromParent();
3746  continue;
3747 
3748  case AMDGPU::S_XOR_B64:
3749  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
3750  Inst.eraseFromParent();
3751  continue;
3752 
3753  case AMDGPU::S_NOT_B64:
3754  splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
3755  Inst.eraseFromParent();
3756  continue;
3757 
3758  case AMDGPU::S_BCNT1_I32_B64:
3759  splitScalar64BitBCNT(Worklist, Inst);
3760  Inst.eraseFromParent();
3761  continue;
3762 
3763  case AMDGPU::S_BFE_I64:
3764  splitScalar64BitBFE(Worklist, Inst);
3765  Inst.eraseFromParent();
3766  continue;
3767 
3768  case AMDGPU::S_LSHL_B32:
3769  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3770  NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
3771  swapOperands(Inst);
3772  }
3773  break;
3774  case AMDGPU::S_ASHR_I32:
3775  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3776  NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
3777  swapOperands(Inst);
3778  }
3779  break;
3780  case AMDGPU::S_LSHR_B32:
3781  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3782  NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
3783  swapOperands(Inst);
3784  }
3785  break;
3786  case AMDGPU::S_LSHL_B64:
3787  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3788  NewOpcode = AMDGPU::V_LSHLREV_B64;
3789  swapOperands(Inst);
3790  }
3791  break;
3792  case AMDGPU::S_ASHR_I64:
3793  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3794  NewOpcode = AMDGPU::V_ASHRREV_I64;
3795  swapOperands(Inst);
3796  }
3797  break;
3798  case AMDGPU::S_LSHR_B64:
3799  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3800  NewOpcode = AMDGPU::V_LSHRREV_B64;
3801  swapOperands(Inst);
3802  }
3803  break;
3804 
3805  case AMDGPU::S_ABS_I32:
3806  lowerScalarAbs(Worklist, Inst);
3807  Inst.eraseFromParent();
3808  continue;
3809 
3810  case AMDGPU::S_CBRANCH_SCC0:
3811  case AMDGPU::S_CBRANCH_SCC1:
3812  // Clear unused bits of vcc
3813  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
3814  AMDGPU::VCC)
3815  .addReg(AMDGPU::EXEC)
3816  .addReg(AMDGPU::VCC);
3817  break;
3818 
3819  case AMDGPU::S_BFE_U64:
3820  case AMDGPU::S_BFM_B64:
3821  llvm_unreachable("Moving this op to VALU not implemented");
3822 
3823  case AMDGPU::S_PACK_LL_B32_B16:
3824  case AMDGPU::S_PACK_LH_B32_B16:
3825  case AMDGPU::S_PACK_HH_B32_B16:
3826  movePackToVALU(Worklist, MRI, Inst);
3827  Inst.eraseFromParent();
3828  continue;
3829 
3830  case AMDGPU::S_XNOR_B32:
3831  lowerScalarXnor(Worklist, Inst);
3832  Inst.eraseFromParent();
3833  continue;
3834 
3835  case AMDGPU::S_XNOR_B64:
3836  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
3837  Inst.eraseFromParent();
3838  continue;
3839 
3840  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
3841  unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3842  const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
3843  auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
3844  unsigned Offset = 0;
3845 
3846  // FIXME: This isn't safe because the addressing mode doesn't work
3847  // correctly if vaddr is negative.
3848  //
3849  // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
3850  //
3851  // See if we can extract an immediate offset by recognizing one of these:
3852  // V_ADD_I32_e32 dst, imm, src1
3853  // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
3854  // V_ADD will be removed by "Remove dead machine instructions".
3855  if (Add &&
3856  (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
3857  Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
3858  static const unsigned SrcNames[2] = {
3859  AMDGPU::OpName::src0,
3860  AMDGPU::OpName::src1,
3861  };
3862 
 3863  // Find a literal offset in one of the source operands.
3864  for (int i = 0; i < 2; i++) {
3865  const MachineOperand *Src =
3866  getNamedOperand(*Add, SrcNames[i]);
3867 
3868  if (Src->isReg()) {
3869  auto Mov = MRI.getUniqueVRegDef(Src->getReg());
3870  if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
3871  Src = &Mov->getOperand(1);
3872  }
3873 
3874  if (Src) {
3875  if (Src->isImm())
3876  Offset = Src->getImm();
3877  else if (Src->isCImm())
3878  Offset = Src->getCImm()->getZExtValue();
3879  }
3880 
3881  if (Offset && isLegalMUBUFImmOffset(Offset)) {
3882  VAddr = getNamedOperand(*Add, SrcNames[!i]);
3883  break;
3884  }
3885 
3886  Offset = 0;
3887  }
3888  }
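  // For example (hypothetical operands), if soff were defined by
  //   %soff = V_ADD_I32_e32 16, %base
  // then VAddr becomes %base and Offset becomes 16, so the rewritten
  // BUFFER_LOAD_DWORD_OFFEN addresses %base + 16 directly and the V_ADD is
  // left dead.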
3889 
3890  MachineInstr *NewInstr =
3891  BuildMI(*MBB, Inst, Inst.getDebugLoc(),
3892  get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
3893  .add(*VAddr) // vaddr
3894  .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
3895  .addImm(0) // soffset
3896  .addImm(Offset) // offset
3897  .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
3898  .addImm(0) // slc
3899  .addImm(0) // tfe
3901  .getInstr();
3902 
3903  MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
3904  VDst);
3905  addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
3906  Inst.eraseFromParent();
3907 
3908  // Legalize all operands other than the offset. Notably, convert the srsrc
3909  // into SGPRs using v_readfirstlane if needed.
3910  legalizeOperands(*NewInstr);
3911  continue;
3912  }
3913  }
3914 
3915  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
3916  // We cannot move this instruction to the VALU, so we should try to
3917  // legalize its operands instead.
3918  legalizeOperands(Inst);
3919  continue;
3920  }
3921 
3922  // Use the new VALU Opcode.
3923  const MCInstrDesc &NewDesc = get(NewOpcode);
3924  Inst.setDesc(NewDesc);
3925 
3926  // Remove any references to SCC. Vector instructions can't read from it, and
 3927  // we're just about to add the implicit use / defs of VCC, so we don't want
3928  // both.
3929  for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
3930  MachineOperand &Op = Inst.getOperand(i);
3931  if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
3932  Inst.RemoveOperand(i);
3933  addSCCDefUsersToVALUWorklist(Inst, Worklist);
3934  }
3935  }
3936 
3937  if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
3938  // We are converting these to a BFE, so we need to add the missing
3939  // operands for the size and offset.
3940  unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
3943 
3944  } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
3945  // The VALU version adds the second operand to the result, so insert an
3946  // extra 0 operand.
3948  }
3949 
3951 
3952  if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
3953  const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
3954  // If we need to move this to VGPRs, we need to unpack the second operand
3955  // back into the 2 separate ones for bit offset and width.
3956  assert(OffsetWidthOp.isImm() &&
3957  "Scalar BFE is only implemented for constant width and offset");
3958  uint32_t Imm = OffsetWidthOp.getImm();
3959 
3960  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3961  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3962  Inst.RemoveOperand(2); // Remove old immediate.
3963  Inst.addOperand(MachineOperand::CreateImm(Offset));
3964  Inst.addOperand(MachineOperand::CreateImm(BitWidth));
3965  }
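  // The scalar BFE packs both fields into a single immediate: bits [5:0]
  // hold the bit offset and bits [22:16] hold the field width. For example,
  // an immediate of 0x00100008 unpacks to Offset = 8 and BitWidth = 16,
  // which become the two separate operands the VALU BFE expects.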
3966 
3967  bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
3968  unsigned NewDstReg = AMDGPU::NoRegister;
3969  if (HasDst) {
3970  unsigned DstReg = Inst.getOperand(0).getReg();
3972  continue;
3973 
3974  // Update the destination register class.
3975  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
3976  if (!NewDstRC)
3977  continue;
3978 
3979  if (Inst.isCopy() &&
3981  NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
3982  // Instead of creating a copy where src and dst are the same register
3983  // class, we just replace all uses of dst with src. These kinds of
3984  // copies interfere with the heuristics MachineSink uses to decide
 3985  // whether or not to split a critical edge, since the pass assumes
3986  // that copies will end up as machine instructions and not be
3987  // eliminated.
3988  addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
3989  MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
3990  MRI.clearKillFlags(Inst.getOperand(1).getReg());
3991  Inst.getOperand(0).setReg(DstReg);
3992 
3993  // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
3994  // these are deleted later, but at -O0 it would leave a suspicious
3995  // looking illegal copy of an undef register.
3996  for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
3997  Inst.RemoveOperand(I);
3998  Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
3999  continue;
4000  }
4001 
4002  NewDstReg = MRI.createVirtualRegister(NewDstRC);
4003  MRI.replaceRegWith(DstReg, NewDstReg);
4004  }
4005 
4006  // Legalize the operands
4007  legalizeOperands(Inst);
4008 
4009  if (HasDst)
4010  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4011  }
4012 }
4013 
4014 // Add/sub require special handling to deal with carry outs.
4015 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
4016  MachineInstr &Inst) const {
4017  if (ST.hasAddNoCarry()) {
4018  // Assume there is no user of scc since we don't select this in that case.
4019  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4020  // is used.
4021 
4022  MachineBasicBlock &MBB = *Inst.getParent();
4024 
4025  unsigned OldDstReg = Inst.getOperand(0).getReg();
4026  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4027 
4028  unsigned Opc = Inst.getOpcode();
4029  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4030 
4031  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4032  AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4033 
4034  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4035  Inst.RemoveOperand(3);
4036 
4037  Inst.setDesc(get(NewOpc));
4038  Inst.addImplicitDefUseOperands(*MBB.getParent());
4039  MRI.replaceRegWith(OldDstReg, ResultReg);
4040  legalizeOperands(Inst);
4041 
4042  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4043  return true;
4044  }
4045 
4046  return false;
4047 }
4048 
4049 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4050  MachineInstr &Inst) const {
4051  MachineBasicBlock &MBB = *Inst.getParent();
4053  MachineBasicBlock::iterator MII = Inst;
4054  DebugLoc DL = Inst.getDebugLoc();
4055 
4056  MachineOperand &Dest = Inst.getOperand(0);
4057  MachineOperand &Src = Inst.getOperand(1);
4058  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4059  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4060 
4061  unsigned SubOp = ST.hasAddNoCarry() ?
4062  AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4063 
4064  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4065  .addImm(0)
4066  .addReg(Src.getReg());
4067 
4068  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4069  .addReg(Src.getReg())
4070  .addReg(TmpReg);
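  // abs(x) is computed as max(x, 0 - x). For example, with x = -5 the
  // subtract produces 5 and V_MAX_I32 selects 5; with x = 7 it selects 7
  // over -7.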
4071 
4072  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4073  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4074 }
4075 
4076 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4077  MachineInstr &Inst) const {
4078  MachineBasicBlock &MBB = *Inst.getParent();
4080  MachineBasicBlock::iterator MII = Inst;
4081  const DebugLoc &DL = Inst.getDebugLoc();
4082 
4083  MachineOperand &Dest = Inst.getOperand(0);
4084  MachineOperand &Src0 = Inst.getOperand(1);
4085  MachineOperand &Src1 = Inst.getOperand(2);
4086 
4087  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4088  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4089 
4090  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4091  if (ST.hasDLInsts()) {
4092  BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4093  .add(Src0)
4094  .add(Src1);
4095  } else {
4096  unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4097  BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
4098  .add(Src0)
4099  .add(Src1);
4100 
4101  BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
4102  .addReg(Xor);
4103  }
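  // xnor(a, b) == ~(a ^ b), so targets without V_XNOR_B32 get an XOR
  // followed by a NOT. For example (illustrative values), 0b1100 xnor
  // 0b1010 is ~(0b0110), i.e. all bits set except bits 1 and 2.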
4104 
4105  MRI.replaceRegWith(Dest.getReg(), NewDest);
4106  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4107 }
4108 
4109 void SIInstrInfo::splitScalar64BitUnaryOp(
4110  SetVectorType &Worklist, MachineInstr &Inst,
4111  unsigned Opcode) const {
4112  MachineBasicBlock &MBB = *Inst.getParent();
4114 
4115  MachineOperand &Dest = Inst.getOperand(0);
4116  MachineOperand &Src0 = Inst.getOperand(1);
4117  DebugLoc DL = Inst.getDebugLoc();
4118 
4119  MachineBasicBlock::iterator MII = Inst;
4120 
4121  const MCInstrDesc &InstDesc = get(Opcode);
4122  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4123  MRI.getRegClass(Src0.getReg()) :
4124  &AMDGPU::SGPR_32RegClass;
4125 
4126  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4127 
4128  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4129  AMDGPU::sub0, Src0SubRC);
4130 
4131  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4132  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4133  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4134 
4135  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4136  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4137 
4138  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4139  AMDGPU::sub1, Src0SubRC);
4140 
4141  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4142  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4143 
4144  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4145  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4146  .addReg(DestSub0)
4147  .addImm(AMDGPU::sub0)
4148  .addReg(DestSub1)
4149  .addImm(AMDGPU::sub1);
4150 
4151  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4152 
 4153  // We don't need to call legalizeOperands here because for a single operand, src0
4154  // will support any kind of input.
4155 
4156  // Move all users of this moved value.
4157  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4158 }
4159 
4160 void SIInstrInfo::splitScalar64BitAddSub(
4161  SetVectorType &Worklist, MachineInstr &Inst) const {
4162  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4163 
4164  MachineBasicBlock &MBB = *Inst.getParent();
4166 
4167  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4168  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4169  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4170 
4171  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4172  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4173 
4174  MachineOperand &Dest = Inst.getOperand(0);
4175  MachineOperand &Src0 = Inst.getOperand(1);
4176  MachineOperand &Src1 = Inst.getOperand(2);
4177  const DebugLoc &DL = Inst.getDebugLoc();
4178  MachineBasicBlock::iterator MII = Inst;
4179 
4180  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4181  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4182  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4183  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4184 
4185  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4186  AMDGPU::sub0, Src0SubRC);
4187  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4188  AMDGPU::sub0, Src1SubRC);
4189 
4190 
4191  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4192  AMDGPU::sub1, Src0SubRC);
4193  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4194  AMDGPU::sub1, Src1SubRC);
4195 
4196  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4197  MachineInstr *LoHalf =
4198  BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4199  .addReg(CarryReg, RegState::Define)
4200  .add(SrcReg0Sub0)
4201  .add(SrcReg1Sub0);
4202 
4203  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4204  MachineInstr *HiHalf =
4205  BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4206  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4207  .add(SrcReg0Sub1)
4208  .add(SrcReg1Sub1)
4209  .addReg(CarryReg, RegState::Kill);
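  // The 64-bit result is assembled from two 32-bit halves: the low-half add
  // writes its carry-out to CarryReg and the high-half add consumes it.
  // For example (illustrative values),
  //   0x00000001ffffffff + 0x0000000000000001
  // gives lo = 0 with carry = 1, then hi = 1 + 0 + 1 = 2, i.e.
  // 0x0000000200000000.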
4210 
4211  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4212  .addReg(DestSub0)
4213  .addImm(AMDGPU::sub0)
4214  .addReg(DestSub1)
4215  .addImm(AMDGPU::sub1);
4216 
4217  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4218 
4219  // Try to legalize the operands in case we need to swap the order to keep it
4220  // valid.
4221  legalizeOperands(*LoHalf);
4222  legalizeOperands(*HiHalf);
4223 
 4224  // Move all users of this moved value.
4225  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4226 }
4227 
4228 void SIInstrInfo::splitScalar64BitBinaryOp(
4229  SetVectorType &Worklist, MachineInstr &Inst,
4230  unsigned Opcode) const {
4231  MachineBasicBlock &MBB = *Inst.getParent();
4233 
4234  MachineOperand &Dest = Inst.getOperand(0);
4235  MachineOperand &Src0 = Inst.getOperand(1);
4236  MachineOperand &Src1 = Inst.getOperand(2);
4237  DebugLoc DL = Inst.getDebugLoc();
4238 
4239  MachineBasicBlock::iterator MII = Inst;
4240 
4241  const MCInstrDesc &InstDesc = get(Opcode);
4242  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4243  MRI.getRegClass(Src0.getReg()) :
4244  &AMDGPU::SGPR_32RegClass;
4245 
4246  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4247  const TargetRegisterClass *Src1RC = Src1.isReg() ?
4248  MRI.getRegClass(Src1.getReg()) :
4249  &AMDGPU::SGPR_32RegClass;
4250 
4251  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4252 
4253  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4254  AMDGPU::sub0, Src0SubRC);
4255  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4256  AMDGPU::sub0, Src1SubRC);
4257 
4258  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4259  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4260  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4261 
4262  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4263  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4264  .add(SrcReg0Sub0)
4265  .add(SrcReg1Sub0);
4266 
4267  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4268  AMDGPU::sub1, Src0SubRC);
4269  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4270  AMDGPU::sub1, Src1SubRC);
4271 
4272  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4273  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4274  .add(SrcReg0Sub1)
4275  .add(SrcReg1Sub1);
4276 
4277  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4278  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4279  .addReg(DestSub0)
4280  .addImm(AMDGPU::sub0)
4281  .addReg(DestSub1)
4282  .addImm(AMDGPU::sub1);
4283 
4284  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4285 
4286  // Try to legalize the operands in case we need to swap the order to keep it
4287  // valid.
4288  legalizeOperands(LoHalf);
4289  legalizeOperands(HiHalf);
4290 
 4291  // Move all users of this moved value.
4292  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4293 }
4294 
4295 void SIInstrInfo::splitScalar64BitBCNT(
4296  SetVectorType &Worklist, MachineInstr &Inst) const {
4297  MachineBasicBlock &MBB = *Inst.getParent();
4299 
4300  MachineBasicBlock::iterator MII = Inst;
4301  DebugLoc DL = Inst.getDebugLoc();
4302 
4303  MachineOperand &Dest = Inst.getOperand(0);
4304  MachineOperand &Src = Inst.getOperand(1);
4305 
4306  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4307  const TargetRegisterClass *SrcRC = Src.isReg() ?
4308  MRI.getRegClass(Src.getReg()) :
4309  &AMDGPU::SGPR_32RegClass;
4310 
4311  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4312  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4313 
4314  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4315 
4316  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4317  AMDGPU::sub0, SrcSubRC);
4318  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4319  AMDGPU::sub1, SrcSubRC);
4320 
4321  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4322 
4323  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
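  // V_BCNT_U32_B32 computes countbits(src0) + src1, so the 64-bit popcount
  // is accumulated in two steps: MidReg = popcount(lo) + 0, then
  // ResultReg = popcount(hi) + MidReg. For example (illustrative value),
  // a source of 0x0000000f00000003 gives 2 + 0 = 2, then 4 + 2 = 6.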
4324 
4325  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4326 
 4327  // We don't need to legalize operands here. src0 for either instruction can be
4328  // an SGPR, and the second input is unused or determined here.
4329  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4330 }
4331 
4332 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4333  MachineInstr &Inst) const {
4334  MachineBasicBlock &MBB = *Inst.getParent();
4336  MachineBasicBlock::iterator MII = Inst;
4337  DebugLoc DL = Inst.getDebugLoc();
4338 
4339  MachineOperand &Dest = Inst.getOperand(0);
4340  uint32_t Imm = Inst.getOperand(2).getImm();
4341  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4342  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4343 
4344  (void) Offset;
4345 
 4346  // Only sext_inreg cases are handled.
4347  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4348  Offset == 0 && "Not implemented");
4349 
4350  if (BitWidth < 32) {
4351  unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4352  unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4353  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4354 
4355  BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4356  .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4357  .addImm(0)
4358  .addImm(BitWidth);
4359 
4360  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4361  .addImm(31)
4362  .addReg(MidRegLo);
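  // For a narrow sext_inreg the low half is sign-extended in place by
  // V_BFE_I32 and the high half is just that sign bit replicated by an
  // arithmetic shift right of 31. For example, sign-extending the 16-bit
  // value 0x8000 yields lo = 0xffff8000 and hi = 0xffffffff.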
4363 
4364  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4365  .addReg(MidRegLo)
4366  .addImm(AMDGPU::sub0)
4367  .addReg(MidRegHi)
4368  .addImm(AMDGPU::sub1);
4369 
4370  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4371  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4372  return;
4373  }
4374 
4375  MachineOperand &Src = Inst.getOperand(1);
4376  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4377  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4378 
4379  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4380  .addImm(31)
4381  .addReg(Src.getReg(), 0, AMDGPU::sub0);
4382 
4383  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4384  .addReg(Src.getReg(), 0, AMDGPU::sub0)
4385  .addImm(AMDGPU::sub0)
4386  .addReg(TmpReg)
4387  .addImm(AMDGPU::sub1);
4388 
4389  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4390  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4391 }
4392 
4393 void SIInstrInfo::addUsersToMoveToVALUWorklist(
4394  unsigned DstReg,
4396  SetVectorType &Worklist) const {
4397  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4398  E = MRI.use_end(); I != E;) {
4399  MachineInstr &UseMI = *I->getParent();
4400  if (!canReadVGPR(UseMI, I.getOperandNo())) {
4401  Worklist.insert(&UseMI);
4402 
4403  do {
4404  ++I;
4405  } while (I != E && I->getParent() == &UseMI);
4406  } else {
4407  ++I;
4408  }
4409  }
4410 }
4411 
4412 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4413  MachineRegisterInfo &MRI,
4414  MachineInstr &Inst) const {
4415  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4416  MachineBasicBlock *MBB = Inst.getParent();
4417  MachineOperand &Src0 = Inst.getOperand(1);
4418  MachineOperand &Src1 = Inst.getOperand(2);
4419  const DebugLoc &DL = Inst.getDebugLoc();
4420 
4421  switch (Inst.getOpcode()) {
4422  case AMDGPU::S_PACK_LL_B32_B16: {
4423  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4424  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4425 
4426  // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
4427  // 0.
4428  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4429  .addImm(0xffff);
4430 
4431  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
4432  .addReg(ImmReg, RegState::Kill)
4433  .add(Src0);
4434 
4435  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
4436  .add(Src1)
4437  .addImm(16)
4438  .addReg(TmpReg, RegState::Kill);
4439  break;
4440  }
4441  case AMDGPU::S_PACK_LH_B32_B16: {
4442  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4443  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4444  .addImm(0xffff);
4445  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
4446  .addReg(ImmReg, RegState::Kill)
4447  .add(Src0)
4448  .add(Src1);
4449  break;
4450  }
4451  case AMDGPU::S_PACK_HH_B32_B16: {
4452  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4453  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4454  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
4455  .addImm(16)
4456  .add(Src0);
4457  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4458  .addImm(0xffff0000);
4459  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
4460  .add(Src1)
4461  .addReg(ImmReg, RegState::Kill)
4462  .addReg(TmpReg, RegState::Kill);
4463  break;
4464  }
4465  default:
4466  llvm_unreachable("unhandled s_pack_* instruction");
4467  }
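  // With example inputs src0 = 0x11112222 and src1 = 0x33334444:
  //   S_PACK_LL -> 0x44442222  ((src1 << 16) | (src0 & 0xffff))
  //   S_PACK_LH -> 0x33332222  (low half of src0, high half of src1, via V_BFI_B32)
  //   S_PACK_HH -> 0x33331111  ((src1 & 0xffff0000) | (src0 >> 16))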
4468 
4469  MachineOperand &Dest = Inst.getOperand(0);
4470  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4471  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4472 }
4473 
4474 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
4475  MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
4476  // This assumes that all the users of SCC are in the same block
4477  // as the SCC def.
4478  for (MachineInstr &MI :
4480  SCCDefInst.getParent()->end())) {
4481  // Exit if we find another SCC def.
4482  if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
4483  return;
4484 
4485  if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
4486  Worklist.insert(&MI);
4487  }
4488 }
4489 
4490 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
4491  const MachineInstr &Inst) const {
4492  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
4493 
4494  switch (Inst.getOpcode()) {
4495  // For target instructions, getOpRegClass just returns the virtual register
4496  // class associated with the operand, so we need to find an equivalent VGPR
4497  // register class in order to move the instruction to the VALU.
4498  case AMDGPU::COPY:
4499  case AMDGPU::PHI:
4500  case AMDGPU::REG_SEQUENCE:
4501  case AMDGPU::INSERT_SUBREG:
4502  case AMDGPU::WQM:
4503  case AMDGPU::WWM:
4504  if (RI.hasVGPRs(NewDstRC))
4505  return nullptr;
4506 
4507  NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
4508  if (!NewDstRC)
4509  return nullptr;
4510  return NewDstRC;
4511  default:
4512  return NewDstRC;
4513  }
4514 }
4515 
4516 // Find the one SGPR operand we are allowed to use.
4517 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
4518  int OpIndices[3]) const {
4519  const MCInstrDesc &Desc = MI.getDesc();
4520 
4521  // Find the one SGPR operand we are allowed to use.
4522  //
4523  // First we need to consider the instruction's operand requirements before
4524  // legalizing. Some operands are required to be SGPRs, such as implicit uses
4525  // of VCC, but we are still bound by the constant bus requirement to only use
4526  // one.
4527  //
4528  // If the operand's class is an SGPR, we can never move it.
4529 
4530  unsigned SGPRReg = findImplicitSGPRRead(MI);
4531  if (SGPRReg != AMDGPU::NoRegister)
4532  return SGPRReg;
4533 
4534  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
4535  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4536 
4537  for (unsigned i = 0; i < 3; ++i) {
4538  int Idx = OpIndices[i];
4539  if (Idx == -1)
4540  break;
4541 
4542  const MachineOperand &MO = MI.getOperand(Idx);
4543  if (!MO.isReg())
4544  continue;
4545 
4546  // Is this operand statically required to be an SGPR based on the operand
4547  // constraints?
4548  const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
4549  bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
4550  if (IsRequiredSGPR)
4551  return MO.getReg();
4552 
 4553  // If this could be a VGPR or an SGPR, check the dynamic register class.
4554  unsigned Reg = MO.getReg();
4555  const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
4556  if (RI.isSGPRClass(RegRC))
4557  UsedSGPRs[i] = Reg;
4558  }
4559 
4560  // We don't have a required SGPR operand, so we have a bit more freedom in
4561  // selecting operands to move.
4562 
4563  // Try to select the most used SGPR. If an SGPR is equal to one of the
4564  // others, we choose that.
4565  //
4566  // e.g.
4567  // V_FMA_F32 v0, s0, s0, s0 -> No moves
4568  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
4569 
4570  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
4571  // prefer those.
4572 
4573  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
4574  if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
4575  SGPRReg = UsedSGPRs[0];
4576  }
4577 
4578  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
4579  if (UsedSGPRs[1] == UsedSGPRs[2])
4580  SGPRReg = UsedSGPRs[1];
4581  }
4582 
4583  return SGPRReg;
4584 }
4585 
4587  unsigned OperandName) const {
4588  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
4589  if (Idx == -1)
4590  return nullptr;
4591 
4592  return &MI.getOperand(Idx);
4593 }
4594 
4596  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
4597  if (ST.isAmdHsaOS()) {
4598  // Set ATC = 1. GFX9 doesn't have this bit.
4599  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
4600  RsrcDataFormat |= (1ULL << 56);
4601 
4602  // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
 4603  // Note that this disables TC L2 and therefore decreases performance.
4604  if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
4605  RsrcDataFormat |= (2ULL << 59);
4606  }
4607 
4608  return RsrcDataFormat;
4609 }
4610 
4612  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
4614  0xffffffff; // Size;
4615 
4616  // GFX9 doesn't have ELEMENT_SIZE.
4617  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
4618  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
4619  Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
4620  }
4621 
4622  // IndexStride = 64.
4623  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
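  // For example, with a maximum private element size of 16 bytes the
  // ELEMENT_SIZE field is Log2_32(16) - 1 = 3, and the index-stride
  // encoding of 3 written above selects the 64-lane stride noted in the
  // comment.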
4624 
4625  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
4626  // Clear them unless we want a huge stride.
4627  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
4628  Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
4629 
4630  return Rsrc23;
4631 }
4632 
4634  unsigned Opc = MI.getOpcode();
4635 
4636  return isSMRD(Opc);
4637 }
4638 
4640  unsigned Opc = MI.getOpcode();
4641 
4642  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4643 }
4644 
4646  int &FrameIndex) const {
4647  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4648  if (!Addr || !Addr->isFI())
4649  return AMDGPU::NoRegister;
4650 
4651  assert(!MI.memoperands_empty() &&
4653 
4654  FrameIndex = Addr->getIndex();
4655  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
4656 }
4657 
4659  int &FrameIndex) const {
4660  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
4661  assert(Addr && Addr->isFI());
4662  FrameIndex = Addr->getIndex();
4663  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
4664 }
4665 
4667  int &FrameIndex) const {
4668  if (!MI.mayLoad())
4669  return AMDGPU::NoRegister;
4670 
4671  if (isMUBUF(MI) || isVGPRSpill(MI))
4672  return isStackAccess(MI, FrameIndex);
4673 
4674  if (isSGPRSpill(MI))
4675  return isSGPRStackAccess(MI, FrameIndex);
4676 
4677  return AMDGPU::NoRegister;
4678 }
4679 
4681  int &FrameIndex) const {
4682  if (!MI.mayStore())
4683  return AMDGPU::NoRegister;
4684 
4685  if (isMUBUF(MI) || isVGPRSpill(MI))
4686  return isStackAccess(MI, FrameIndex);
4687 
4688  if (isSGPRSpill(MI))
4689  return isSGPRStackAccess(MI, FrameIndex);
4690 
4691  return AMDGPU::NoRegister;
4692 }
4693 
4695  unsigned Size = 0;
4698  while (++I != E && I->isInsideBundle()) {
4699  assert(!I->isBundle() && "No nested bundle!");
4700  Size += getInstSizeInBytes(*I);
4701  }
4702 
4703  return Size;
4704 }
4705 
4707  unsigned Opc = MI.getOpcode();
4708  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
4709  unsigned DescSize = Desc.getSize();
4710 
4711  // If we have a definitive size, we can use it. Otherwise we need to inspect
4712  // the operands to know the size.
4713  //
4714  // FIXME: Instructions that have a base 32-bit encoding report their size as
4715  // 4, even though they are really 8 bytes if they have a literal operand.
4716  if (DescSize != 0 && DescSize != 4)
4717  return DescSize;
4718 
4719  // 4-byte instructions may have a 32-bit literal encoded after them. Check
 4720  // operands that could ever be literals.
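  // For example, a VOP2 add using an inline constant such as 1.0 stays at
  // 4 bytes, while the same opcode with a literal like 1.5 needs a trailing
  // 32-bit literal dword and reports 8.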
4721  if (isVALU(MI) || isSALU(MI)) {
4722  if (isFixedSize(MI))
4723  return DescSize;
4724 
4725  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4726  if (Src0Idx == -1)
4727  return 4; // No operands.
4728 
4729  if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
4730  return 8;
4731 
4732  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4733  if (Src1Idx == -1)
4734  return 4;
4735 
4736  if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
4737  return 8;
4738 
4739  return 4;
4740  }
4741 
4742  if (DescSize == 4)
4743  return 4;
4744 
4745  switch (Opc) {
4746  case TargetOpcode::IMPLICIT_DEF:
4747  case TargetOpcode::KILL:
4748  case TargetOpcode::DBG_VALUE:
4750  return 0;
4751  case TargetOpcode::BUNDLE:
4752  return getInstBundleSize(MI);
4753  case TargetOpcode::INLINEASM: {
4754  const MachineFunction *MF = MI.getParent()->getParent();
4755  const char *AsmStr = MI.getOperand(0).getSymbolName();
4756  return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
4757  }
4758  default:
4759  llvm_unreachable("unable to find instruction size");
4760  }
4761 }
4762 
4764  if (!isFLAT(MI))
4765  return false;
4766 
4767  if (MI.memoperands_empty())
4768  return true;
4769 
4770  for (const MachineMemOperand *MMO : MI.memoperands()) {
4771  if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
4772  return true;
4773  }
4774  return false;
4775 }
4776 
4778  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
4779 }
4780 
4782  MachineBasicBlock *IfEnd) const {
4784  assert(TI != IfEntry->end());
4785 
4786  MachineInstr *Branch = &(*TI);
4787  MachineFunction *MF = IfEntry->getParent();
4788  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
4789 
4790  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4791  unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4792  MachineInstr *SIIF =
4793  BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
4794  .add(Branch->getOperand(0))
4795  .add(Branch->getOperand(1));
4796  MachineInstr *SIEND =
4797  BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
4798  .addReg(DstReg);
4799 
4800  IfEntry->erase(TI);
4801  IfEntry->insert(IfEntry->end(), SIIF);
4802  IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
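  // For illustration, a terminator of the form
  //   SI_NON_UNIFORM_BRCOND_PSEUDO %cond, %target
  // at the end of IfEntry is rewritten to
  //   %dst = SI_IF %cond, %target
  // and IfEnd receives the matching SI_END_CF %dst before its first
  // non-PHI instruction, so later control-flow lowering can expand the
  // pair into the EXEC-mask save/restore for the structured if region.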
4803  }
4804 }
4805 
4807  MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
4809  // We expect 2 terminators, one conditional and one unconditional.
4810  assert(TI != LoopEnd->end());
4811 
4812  MachineInstr *Branch = &(*TI);
4813  MachineFunction *MF = LoopEnd->getParent();
4814  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
4815 
4816  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4817 
4818  unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4819  unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4820  MachineInstrBuilder HeaderPHIBuilder =
4821  BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
4822  for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
4823  E = LoopEntry->pred_end();
4824  PI != E; ++PI) {
4825  if (*PI == LoopEnd) {
4826  HeaderPHIBuilder.addReg(BackEdgeReg);
4827  } else {
4828  MachineBasicBlock *PMBB = *PI;
4829  unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4831  ZeroReg, 0);
4832  HeaderPHIBuilder.addReg(ZeroReg);
4833  }
4834  HeaderPHIBuilder.addMBB(*PI);
4835  }
4836  MachineInstr *HeaderPhi = HeaderPHIBuilder;
4837  MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
4838  get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
4839  .addReg(DstReg)
4840  .add(Branch->getOperand(0));
4841  MachineInstr *SILOOP =
4842  BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
4843  .addReg(BackEdgeReg)
4844  .addMBB(LoopEntry);
4845 
4846  LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
4847  LoopEnd->erase(TI);
4848  LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
4849  LoopEnd->insert(LoopEnd->end(), SILOOP);
4850  }
4851 }
4852 
4855  static const std::pair<int, const char *> TargetIndices[] = {
4856  {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
4857  {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
4858  {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
4859  {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
4860  {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
4861  return makeArrayRef(TargetIndices);
4862 }
4863 
4864 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
4865 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
4868  const ScheduleDAG *DAG) const {
4869  return new GCNHazardRecognizer(DAG->MF);
4870 }
4871 
4872 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
4873 /// pass.
4876  return new GCNHazardRecognizer(MF);
4877 }
4878 
4879 std::pair<unsigned, unsigned>
4881  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
4882 }
4883 
4886  static const std::pair<unsigned, const char *> TargetFlags[] = {
4887  { MO_GOTPCREL, "amdgpu-gotprel" },
4888  { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
4889  { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
4890  { MO_REL32_LO, "amdgpu-rel32-lo" },
4891  { MO_REL32_HI, "amdgpu-rel32-hi" }
4892  };
4893 
4894  return makeArrayRef(TargetFlags);
4895 }
4896 
4898  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
4899  MI.modifiesRegister(AMDGPU::EXEC, &RI);
4900 }
4901 
4905  const DebugLoc &DL,
4906  unsigned DestReg) const {
4907  if (ST.hasAddNoCarry())
4908  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
4909 
4910  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4911  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4912  MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
4913 
4914  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
4915  .addReg(UnusedCarry, RegState::Define | RegState::Dead);
4916 }
4917 
4918 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
4919  switch (Opcode) {
4920  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
4921  case AMDGPU::SI_KILL_I1_TERMINATOR:
4922  return true;
4923  default:
4924  return false;
4925  }
4926 }
4927 
4929  switch (Opcode) {
4930  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
4931  return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
4932  case AMDGPU::SI_KILL_I1_PSEUDO:
4933  return get(AMDGPU::SI_KILL_I1_TERMINATOR);
4934  default:
4935  llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
4936  }
4937 }
4938 
4940  if (!isSMRD(MI))
4941  return false;
4942 
4943  // Check that it is using a buffer resource.
4944  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
4945  if (Idx == -1) // e.g. s_memtime
4946  return false;
4947 
4948  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
4949  return RCID == AMDGPU::SReg_128RegClassID;
4950 }