1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// SI Implementation of TargetInstrInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "SIInstrInfo.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "GCNHazardRecognizer.h"
20 #include "SIDefines.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/APInt.h"
26 #include "llvm/ADT/ArrayRef.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/StringRef.h"
47 #include "llvm/IR/DebugLoc.h"
48 #include "llvm/IR/DiagnosticInfo.h"
49 #include "llvm/IR/Function.h"
50 #include "llvm/IR/InlineAsm.h"
51 #include "llvm/IR/LLVMContext.h"
52 #include "llvm/MC/MCInstrDesc.h"
53 #include "llvm/Support/Casting.h"
55 #include "llvm/Support/Compiler.h"
60 #include <cassert>
61 #include <cstdint>
62 #include <iterator>
63 #include <utility>
64 
65 using namespace llvm;
66 
67 #define GET_INSTRINFO_CTOR_DTOR
68 #include "AMDGPUGenInstrInfo.inc"
69 
70 namespace llvm {
71 namespace AMDGPU {
72 #define GET_D16ImageDimIntrinsics_IMPL
73 #define GET_ImageDimIntrinsicTable_IMPL
74 #define GET_RsrcIntrinsics_IMPL
75 #include "AMDGPUGenSearchableTables.inc"
76 }
77 }
78 
79 
80 // Must be at least 4 to be able to branch over minimum unconditional branch
81 // code. This is only for making it possible to write reasonably small tests for
82 // long branches.
83 static cl::opt<unsigned>
84 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
85  cl::desc("Restrict range of branch instructions (DEBUG)"));
86 
88  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
89  RI(ST), ST(ST) {}
90 
91 //===----------------------------------------------------------------------===//
92 // TargetInstrInfo callbacks
93 //===----------------------------------------------------------------------===//
94 
95 static unsigned getNumOperandsNoGlue(SDNode *Node) {
96  unsigned N = Node->getNumOperands();
97  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
98  --N;
99  return N;
100 }
101 
103  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
104  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
105  return LastOp;
106 }
107 
108 /// Returns true if both nodes have the same value for the given
109 /// operand \p OpName, or if neither node has this operand.
110 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
111  unsigned Opc0 = N0->getMachineOpcode();
112  unsigned Opc1 = N1->getMachineOpcode();
113 
114  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
115  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
116 
117  if (Op0Idx == -1 && Op1Idx == -1)
118  return true;
119 
120 
121  if ((Op0Idx == -1 && Op1Idx != -1) ||
122  (Op1Idx == -1 && Op0Idx != -1))
123  return false;
124 
125  // getNamedOperandIdx returns the index for the MachineInstr's operands,
126  // which includes the result as the first operand. We are indexing into the
127  // MachineSDNode's operands, so we need to skip the result operand to get
128  // the real index.
129  --Op0Idx;
130  --Op1Idx;
131 
132  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
133 }
134 
136  AliasAnalysis *AA) const {
137  // TODO: The generic check fails for VALU instructions that should be
138  // rematerializable due to implicit reads of exec. We really want all of the
139  // generic logic here except for the implicit exec check.
140  switch (MI.getOpcode()) {
141  case AMDGPU::V_MOV_B32_e32:
142  case AMDGPU::V_MOV_B32_e64:
143  case AMDGPU::V_MOV_B64_PSEUDO:
144  return true;
145  default:
146  return false;
147  }
148 }
149 
151  int64_t &Offset0,
152  int64_t &Offset1) const {
153  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
154  return false;
155 
156  unsigned Opc0 = Load0->getMachineOpcode();
157  unsigned Opc1 = Load1->getMachineOpcode();
158 
159  // Make sure both are actually loads.
160  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
161  return false;
162 
163  if (isDS(Opc0) && isDS(Opc1)) {
164 
165  // FIXME: Handle this case:
166  if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
167  return false;
168 
169  // Check base reg.
170  if (Load0->getOperand(1) != Load1->getOperand(1))
171  return false;
172 
173  // Check chain.
174  if (findChainOperand(Load0) != findChainOperand(Load1))
175  return false;
176 
177  // Skip read2 / write2 variants for simplicity.
178  // TODO: We should report true if the used offsets are adjacent (excluding
179  // the st64 versions).
180  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
181  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
182  return false;
183 
184  Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
185  Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
186  return true;
187  }
188 
189  if (isSMRD(Opc0) && isSMRD(Opc1)) {
190  // Skip time and cache invalidation instructions.
191  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
192  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
193  return false;
194 
196 
197  // Check base reg.
198  if (Load0->getOperand(0) != Load1->getOperand(0))
199  return false;
200 
201  const ConstantSDNode *Load0Offset =
202  dyn_cast<ConstantSDNode>(Load0->getOperand(1));
203  const ConstantSDNode *Load1Offset =
204  dyn_cast<ConstantSDNode>(Load1->getOperand(1));
205 
206  if (!Load0Offset || !Load1Offset)
207  return false;
208 
209  // Check chain.
210  if (findChainOperand(Load0) != findChainOperand(Load1))
211  return false;
212 
213  Offset0 = Load0Offset->getZExtValue();
214  Offset1 = Load1Offset->getZExtValue();
215  return true;
216  }
217 
218  // MUBUF and MTBUF can access the same addresses.
219  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
220 
221  // MUBUF and MTBUF have vaddr at different indices.
222  if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
223  findChainOperand(Load0) != findChainOperand(Load1) ||
224  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
225  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
226  return false;
227 
228  int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
229  int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
230 
231  if (OffIdx0 == -1 || OffIdx1 == -1)
232  return false;
233 
234  // getNamedOperandIdx returns the index for MachineInstrs. Since they
235  // include the output in the operand list, but SDNodes don't, we need to
236  // subtract one from the index.
237  --OffIdx0;
238  --OffIdx1;
239 
240  SDValue Off0 = Load0->getOperand(OffIdx0);
241  SDValue Off1 = Load1->getOperand(OffIdx1);
242 
243  // The offset might be a FrameIndexSDNode.
244  if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
245  return false;
246 
247  Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
248  Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
249  return true;
250  }
251 
252  return false;
253 }
254 
255 static bool isStride64(unsigned Opc) {
256  switch (Opc) {
257  case AMDGPU::DS_READ2ST64_B32:
258  case AMDGPU::DS_READ2ST64_B64:
259  case AMDGPU::DS_WRITE2ST64_B32:
260  case AMDGPU::DS_WRITE2ST64_B64:
261  return true;
262  default:
263  return false;
264  }
265 }
266 
267 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
268  int64_t &Offset,
269  const TargetRegisterInfo *TRI) const {
270  unsigned Opc = LdSt.getOpcode();
271 
272  if (isDS(LdSt)) {
273  const MachineOperand *OffsetImm =
274  getNamedOperand(LdSt, AMDGPU::OpName::offset);
275  if (OffsetImm) {
276  // Normal, single offset LDS instruction.
277  const MachineOperand *AddrReg =
278  getNamedOperand(LdSt, AMDGPU::OpName::addr);
279 
280  BaseReg = AddrReg->getReg();
281  Offset = OffsetImm->getImm();
282  return true;
283  }
284 
285  // The two-offset instructions use offset0 and offset1 instead. We can treat
286  // these as a load with a single offset if the two offsets are consecutive. We
287  // will use this for some partially aligned loads.
288  const MachineOperand *Offset0Imm =
289  getNamedOperand(LdSt, AMDGPU::OpName::offset0);
290  const MachineOperand *Offset1Imm =
291  getNamedOperand(LdSt, AMDGPU::OpName::offset1);
292 
293  uint8_t Offset0 = Offset0Imm->getImm();
294  uint8_t Offset1 = Offset1Imm->getImm();
295 
296  if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
297  // Each of these offsets is in element-sized units, so we need to convert
298  // them to the byte offsets of the individual reads.
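 // For example, assuming a ds_read2_b32 with offset0 = 2 and offset1 = 3, the
 // element size works out to 4 bytes and the reported base offset is 4 * 2 = 8.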
299 
300  unsigned EltSize;
301  if (LdSt.mayLoad())
302  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
303  else {
304  assert(LdSt.mayStore());
305  int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
306  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
307  }
308 
309  if (isStride64(Opc))
310  EltSize *= 64;
311 
312  const MachineOperand *AddrReg =
313  getNamedOperand(LdSt, AMDGPU::OpName::addr);
314  BaseReg = AddrReg->getReg();
315  Offset = EltSize * Offset0;
316  return true;
317  }
318 
319  return false;
320  }
321 
322  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
323  const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
324  if (SOffset && SOffset->isReg())
325  return false;
326 
327  const MachineOperand *AddrReg =
328  getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
329  if (!AddrReg)
330  return false;
331 
332  const MachineOperand *OffsetImm =
333  getNamedOperand(LdSt, AMDGPU::OpName::offset);
334  BaseReg = AddrReg->getReg();
335  Offset = OffsetImm->getImm();
336 
337  if (SOffset) // soffset can be an inline immediate.
338  Offset += SOffset->getImm();
339 
340  return true;
341  }
342 
343  if (isSMRD(LdSt)) {
344  const MachineOperand *OffsetImm =
345  getNamedOperand(LdSt, AMDGPU::OpName::offset);
346  if (!OffsetImm)
347  return false;
348 
349  const MachineOperand *SBaseReg =
350  getNamedOperand(LdSt, AMDGPU::OpName::sbase);
351  BaseReg = SBaseReg->getReg();
352  Offset = OffsetImm->getImm();
353  return true;
354  }
355 
356  if (isFLAT(LdSt)) {
357  const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
358  if (VAddr) {
359  // Can't analyze 2 offsets.
360  if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
361  return false;
362 
363  BaseReg = VAddr->getReg();
364  } else {
365  // Scratch instructions have either vaddr or saddr.
366  BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
367  }
368 
369  Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
370  return true;
371  }
372 
373  return false;
374 }
375 
376 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
377  const MachineInstr &MI2, unsigned BaseReg2) {
378  if (BaseReg1 == BaseReg2)
379  return true;
380 
381  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
382  return false;
383 
384  auto MO1 = *MI1.memoperands_begin();
385  auto MO2 = *MI2.memoperands_begin();
386  if (MO1->getAddrSpace() != MO2->getAddrSpace())
387  return false;
388 
389  auto Base1 = MO1->getValue();
390  auto Base2 = MO2->getValue();
391  if (!Base1 || !Base2)
392  return false;
393  const MachineFunction &MF = *MI1.getParent()->getParent();
394  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
395  Base1 = GetUnderlyingObject(Base1, DL);
396  Base2 = GetUnderlyingObject(Base2, DL);
397 
398  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
399  return false;
400 
401  return Base1 == Base2;
402 }
403 
405  unsigned BaseReg1,
406  MachineInstr &SecondLdSt,
407  unsigned BaseReg2,
408  unsigned NumLoads) const {
409  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
410  return false;
411 
412  const MachineOperand *FirstDst = nullptr;
413  const MachineOperand *SecondDst = nullptr;
414 
415  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
416  (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
417  (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
418  const unsigned MaxGlobalLoadCluster = 6;
419  if (NumLoads > MaxGlobalLoadCluster)
420  return false;
421 
422  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
423  if (!FirstDst)
424  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
425  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
426  if (!SecondDst)
427  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
428  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
429  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
430  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
431  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
432  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
433  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
434  }
435 
436  if (!FirstDst || !SecondDst)
437  return false;
438 
439  // Try to limit clustering based on the total number of bytes loaded
440  // rather than the number of instructions. This is done to help reduce
441  // register pressure. The method used is somewhat inexact, though,
442  // because it assumes that all loads in the cluster will load the
443  // same number of bytes as FirstLdSt.
444 
445  // The unit of this value is bytes.
446  // FIXME: This needs finer tuning.
447  unsigned LoadClusterThreshold = 16;
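 // With a 16-byte threshold this allows clustering, e.g., up to four dword
 // loads or two dwordx2 loads, assuming every load in the cluster has the same
 // width as FirstLdSt.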
448 
449  const MachineRegisterInfo &MRI =
450  FirstLdSt.getParent()->getParent()->getRegInfo();
451  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
452 
453  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
454 }
455 
456 // FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
457 // the first 16 loads will be interleaved with the stores, and the next 16 will
458 // be clustered as expected. It should really split into two batches of 16 stores.
459 //
460 // Loads are clustered until this returns false, rather than trying to schedule
461 // groups of stores. This also means we have to decide whether loads from different
462 // address spaces should be clustered, and whether to cluster ones which might
463 // cause bank conflicts.
464 //
465 // This might be deprecated so it might not be worth that much effort to fix.
467  int64_t Offset0, int64_t Offset1,
468  unsigned NumLoads) const {
469  assert(Offset1 > Offset0 &&
470  "Second offset should be larger than first offset!");
471  // If we have fewer than 16 loads in a row, and the offsets are within 64
472  // bytes, then schedule them together.
473 
474  // A cacheline is 64 bytes (for global memory).
475  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
476 }
477 
480  const DebugLoc &DL, unsigned DestReg,
481  unsigned SrcReg, bool KillSrc) {
482  MachineFunction *MF = MBB.getParent();
483  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
484  "illegal SGPR to VGPR copy",
485  DL, DS_Error);
486  LLVMContext &C = MF->getFunction().getContext();
487  C.diagnose(IllegalCopy);
488 
489  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
490  .addReg(SrcReg, getKillRegState(KillSrc));
491 }
492 
495  const DebugLoc &DL, unsigned DestReg,
496  unsigned SrcReg, bool KillSrc) const {
497  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
498 
499  if (RC == &AMDGPU::VGPR_32RegClass) {
500  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
501  AMDGPU::SReg_32RegClass.contains(SrcReg));
502  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
503  .addReg(SrcReg, getKillRegState(KillSrc));
504  return;
505  }
506 
507  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
508  RC == &AMDGPU::SReg_32RegClass) {
509  if (SrcReg == AMDGPU::SCC) {
510  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
511  .addImm(-1)
512  .addImm(0);
513  return;
514  }
515 
516  if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
517  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
518  return;
519  }
520 
521  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
522  .addReg(SrcReg, getKillRegState(KillSrc));
523  return;
524  }
525 
526  if (RC == &AMDGPU::SReg_64RegClass) {
527  if (DestReg == AMDGPU::VCC) {
528  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
529  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
530  .addReg(SrcReg, getKillRegState(KillSrc));
531  } else {
532  // FIXME: Hack until VReg_1 is removed.
533  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
534  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
535  .addImm(0)
536  .addReg(SrcReg, getKillRegState(KillSrc));
537  }
538 
539  return;
540  }
541 
542  if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
543  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
544  return;
545  }
546 
547  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
548  .addReg(SrcReg, getKillRegState(KillSrc));
549  return;
550  }
551 
552  if (DestReg == AMDGPU::SCC) {
553  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
554  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
555  .addReg(SrcReg, getKillRegState(KillSrc))
556  .addImm(0);
557  return;
558  }
559 
560  unsigned EltSize = 4;
561  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
562  if (RI.isSGPRClass(RC)) {
563  if (RI.getRegSizeInBits(*RC) > 32) {
564  Opcode = AMDGPU::S_MOV_B64;
565  EltSize = 8;
566  } else {
567  Opcode = AMDGPU::S_MOV_B32;
568  EltSize = 4;
569  }
570 
571  if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
572  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
573  return;
574  }
575  }
576 
577  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
578  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
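 // Copy the sub-registers low-to-high when the destination starts at or below
 // the source, and high-to-low otherwise, presumably so that overlapping
 // source and destination register tuples are not clobbered before being read.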
579 
580  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
581  unsigned SubIdx;
582  if (Forward)
583  SubIdx = SubIndices[Idx];
584  else
585  SubIdx = SubIndices[SubIndices.size() - Idx - 1];
586 
587  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
588  get(Opcode), RI.getSubReg(DestReg, SubIdx));
589 
590  Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
591 
592  if (Idx == 0)
593  Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
594 
595  bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
596  Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
597  }
598 }
599 
600 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
601  int NewOpc;
602 
603  // Try to map original to commuted opcode
604  NewOpc = AMDGPU::getCommuteRev(Opcode);
605  if (NewOpc != -1)
606  // Check if the commuted (REV) opcode exists on the target.
607  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
608 
609  // Try to map commuted to original opcode
610  NewOpc = AMDGPU::getCommuteOrig(Opcode);
611  if (NewOpc != -1)
612  // Check if the original (non-REV) opcode exists on the target.
613  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
614 
615  return Opcode;
616 }
617 
620  const DebugLoc &DL, unsigned DestReg,
621  int64_t Value) const {
623  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
624  if (RegClass == &AMDGPU::SReg_32RegClass ||
625  RegClass == &AMDGPU::SGPR_32RegClass ||
626  RegClass == &AMDGPU::SReg_32_XM0RegClass ||
627  RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
628  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
629  .addImm(Value);
630  return;
631  }
632 
633  if (RegClass == &AMDGPU::SReg_64RegClass ||
634  RegClass == &AMDGPU::SGPR_64RegClass ||
635  RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
636  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
637  .addImm(Value);
638  return;
639  }
640 
641  if (RegClass == &AMDGPU::VGPR_32RegClass) {
642  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
643  .addImm(Value);
644  return;
645  }
646  if (RegClass == &AMDGPU::VReg_64RegClass) {
647  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
648  .addImm(Value);
649  return;
650  }
651 
652  unsigned EltSize = 4;
653  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
654  if (RI.isSGPRClass(RegClass)) {
655  if (RI.getRegSizeInBits(*RegClass) > 32) {
656  Opcode = AMDGPU::S_MOV_B64;
657  EltSize = 8;
658  } else {
659  Opcode = AMDGPU::S_MOV_B32;
660  EltSize = 4;
661  }
662  }
663 
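 // For wider register classes, write the value to the first sub-register and
 // zero to each remaining sub-register.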
664  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
665  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
666  int64_t IdxValue = Idx == 0 ? Value : 0;
667 
668  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
669  get(Opcode), RI.getSubReg(DestReg, Idx));
670  Builder.addImm(IdxValue);
671  }
672 }
673 
674 const TargetRegisterClass *
676  return &AMDGPU::VGPR_32RegClass;
677 }
678 
681  const DebugLoc &DL, unsigned DstReg,
683  unsigned TrueReg,
684  unsigned FalseReg) const {
686  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
687  "Not a VGPR32 reg");
688 
689  if (Cond.size() == 1) {
690  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
691  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
692  .add(Cond[0]);
693  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
694  .addReg(FalseReg)
695  .addReg(TrueReg)
696  .addReg(SReg);
697  } else if (Cond.size() == 2) {
698  assert(Cond[0].isImm() && "Cond[0] is not an immediate");
699  switch (Cond[0].getImm()) {
700  case SIInstrInfo::SCC_TRUE: {
701  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
702  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
703  .addImm(-1)
704  .addImm(0);
705  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
706  .addReg(FalseReg)
707  .addReg(TrueReg)
708  .addReg(SReg);
709  break;
710  }
711  case SIInstrInfo::SCC_FALSE: {
712  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
713  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
714  .addImm(0)
715  .addImm(-1);
716  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
717  .addReg(FalseReg)
718  .addReg(TrueReg)
719  .addReg(SReg);
720  break;
721  }
722  case SIInstrInfo::VCCNZ: {
723  MachineOperand RegOp = Cond[1];
724  RegOp.setImplicit(false);
725  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
726  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
727  .add(RegOp);
728  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
729  .addReg(FalseReg)
730  .addReg(TrueReg)
731  .addReg(SReg);
732  break;
733  }
734  case SIInstrInfo::VCCZ: {
735  MachineOperand RegOp = Cond[1];
736  RegOp.setImplicit(false);
737  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
738  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
739  .add(RegOp);
740  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
741  .addReg(TrueReg)
742  .addReg(FalseReg)
743  .addReg(SReg);
744  break;
745  }
746  case SIInstrInfo::EXECNZ: {
747  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
748  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
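 // S_OR_SAVEEXEC_B64 with a zero source leaves EXEC unchanged but sets SCC to
 // (EXEC != 0); the S_CSELECT_B64 below then materializes an all-ones or zero
 // mask from SCC.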
749  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
750  .addImm(0);
751  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
752  .addImm(-1)
753  .addImm(0);
754  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
755  .addReg(FalseReg)
756  .addReg(TrueReg)
757  .addReg(SReg);
758  break;
759  }
760  case SIInstrInfo::EXECZ: {
761  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
762  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
763  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
764  .addImm(0);
765  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
766  .addImm(0)
767  .addImm(-1);
768  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
769  .addReg(FalseReg)
770  .addReg(TrueReg)
771  .addReg(SReg);
772  llvm_unreachable("Unhandled branch predicate EXECZ");
773  break;
774  }
775  default:
776  llvm_unreachable("invalid branch predicate");
777  }
778  } else {
779  llvm_unreachable("Can only handle Cond size 1 or 2");
780  }
781 }
782 
785  const DebugLoc &DL,
786  unsigned SrcReg, int Value) const {
788  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
789  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
790  .addImm(Value)
791  .addReg(SrcReg);
792 
793  return Reg;
794 }
795 
798  const DebugLoc &DL,
799  unsigned SrcReg, int Value) const {
801  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
802  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
803  .addImm(Value)
804  .addReg(SrcReg);
805 
806  return Reg;
807 }
808 
809 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
810 
811  if (RI.getRegSizeInBits(*DstRC) == 32) {
812  return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
813  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
814  return AMDGPU::S_MOV_B64;
815  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
816  return AMDGPU::V_MOV_B64_PSEUDO;
817  }
818  return AMDGPU::COPY;
819 }
820 
821 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
822  switch (Size) {
823  case 4:
824  return AMDGPU::SI_SPILL_S32_SAVE;
825  case 8:
826  return AMDGPU::SI_SPILL_S64_SAVE;
827  case 16:
828  return AMDGPU::SI_SPILL_S128_SAVE;
829  case 32:
830  return AMDGPU::SI_SPILL_S256_SAVE;
831  case 64:
832  return AMDGPU::SI_SPILL_S512_SAVE;
833  default:
834  llvm_unreachable("unknown register size");
835  }
836 }
837 
838 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
839  switch (Size) {
840  case 4:
841  return AMDGPU::SI_SPILL_V32_SAVE;
842  case 8:
843  return AMDGPU::SI_SPILL_V64_SAVE;
844  case 12:
845  return AMDGPU::SI_SPILL_V96_SAVE;
846  case 16:
847  return AMDGPU::SI_SPILL_V128_SAVE;
848  case 32:
849  return AMDGPU::SI_SPILL_V256_SAVE;
850  case 64:
851  return AMDGPU::SI_SPILL_V512_SAVE;
852  default:
853  llvm_unreachable("unknown register size");
854  }
855 }
856 
859  unsigned SrcReg, bool isKill,
860  int FrameIndex,
861  const TargetRegisterClass *RC,
862  const TargetRegisterInfo *TRI) const {
863  MachineFunction *MF = MBB.getParent();
865  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
866  DebugLoc DL = MBB.findDebugLoc(MI);
867 
868  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
869  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
870  MachinePointerInfo PtrInfo
871  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
872  MachineMemOperand *MMO
874  Size, Align);
875  unsigned SpillSize = TRI->getSpillSize(*RC);
876 
877  if (RI.isSGPRClass(RC)) {
878  MFI->setHasSpilledSGPRs();
879 
880  // We are only allowed to create one new instruction when spilling
881  // registers, so we need to use a pseudo instruction for spilling SGPRs.
882  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
883 
884  // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
885  // to make sure we are using the correct register class.
886  if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
888  MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
889  }
890 
891  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
892  .addReg(SrcReg, getKillRegState(isKill)) // data
893  .addFrameIndex(FrameIndex) // addr
894  .addMemOperand(MMO)
896  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
897  // Add the scratch resource registers as implicit uses because we may end up
898  // needing them, and need to ensure that the reserved registers are
899  // correctly handled.
900 
901  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
902  if (ST.hasScalarStores()) {
903  // m0 is used for offset to scalar stores if used to spill.
904  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
905  }
906 
907  return;
908  }
909 
910  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
911  LLVMContext &Ctx = MF->getFunction().getContext();
912  Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
913  " spill register");
914  BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
915  .addReg(SrcReg);
916 
917  return;
918  }
919 
920  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
921 
922  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
923  MFI->setHasSpilledVGPRs();
924  BuildMI(MBB, MI, DL, get(Opcode))
925  .addReg(SrcReg, getKillRegState(isKill)) // data
926  .addFrameIndex(FrameIndex) // addr
927  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
928  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
929  .addImm(0) // offset
930  .addMemOperand(MMO);
931 }
932 
933 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
934  switch (Size) {
935  case 4:
936  return AMDGPU::SI_SPILL_S32_RESTORE;
937  case 8:
938  return AMDGPU::SI_SPILL_S64_RESTORE;
939  case 16:
940  return AMDGPU::SI_SPILL_S128_RESTORE;
941  case 32:
942  return AMDGPU::SI_SPILL_S256_RESTORE;
943  case 64:
944  return AMDGPU::SI_SPILL_S512_RESTORE;
945  default:
946  llvm_unreachable("unknown register size");
947  }
948 }
949 
950 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
951  switch (Size) {
952  case 4:
953  return AMDGPU::SI_SPILL_V32_RESTORE;
954  case 8:
955  return AMDGPU::SI_SPILL_V64_RESTORE;
956  case 12:
957  return AMDGPU::SI_SPILL_V96_RESTORE;
958  case 16:
959  return AMDGPU::SI_SPILL_V128_RESTORE;
960  case 32:
961  return AMDGPU::SI_SPILL_V256_RESTORE;
962  case 64:
963  return AMDGPU::SI_SPILL_V512_RESTORE;
964  default:
965  llvm_unreachable("unknown register size");
966  }
967 }
968 
971  unsigned DestReg, int FrameIndex,
972  const TargetRegisterClass *RC,
973  const TargetRegisterInfo *TRI) const {
974  MachineFunction *MF = MBB.getParent();
976  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
977  DebugLoc DL = MBB.findDebugLoc(MI);
978  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
979  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
980  unsigned SpillSize = TRI->getSpillSize(*RC);
981 
982  MachinePointerInfo PtrInfo
983  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
984 
986  PtrInfo, MachineMemOperand::MOLoad, Size, Align);
987 
988  if (RI.isSGPRClass(RC)) {
989  // FIXME: Maybe this should not include a memoperand because it will be
990  // lowered to non-memory instructions.
991  const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
992  if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
994  MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
995  }
996 
997  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
998  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
999  .addFrameIndex(FrameIndex) // addr
1000  .addMemOperand(MMO)
1002  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
1003 
1004  if (ST.hasScalarStores()) {
1005  // m0 is used for offset to scalar stores if used to spill.
1006  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
1007  }
1008 
1009  return;
1010  }
1011 
1012  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
1013  LLVMContext &Ctx = MF->getFunction().getContext();
1014  Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
1015  " restore register");
1016  BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
1017 
1018  return;
1019  }
1020 
1021  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
1022 
1023  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
1024  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1025  .addFrameIndex(FrameIndex) // vaddr
1026  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
1027  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
1028  .addImm(0) // offset
1029  .addMemOperand(MMO);
1030 }
1031 
1032 /// \param FrameOffset Offset in bytes of the FrameIndex being spilled
1034  MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
1035  unsigned FrameOffset, unsigned Size) const {
1036  MachineFunction *MF = MBB.getParent();
1038  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1039  DebugLoc DL = MBB.findDebugLoc(MI);
1040  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1041  unsigned WavefrontSize = ST.getWavefrontSize();
1042 
1043  unsigned TIDReg = MFI->getTIDReg();
1044  if (!MFI->hasCalculatedTID()) {
1045  MachineBasicBlock &Entry = MBB.getParent()->front();
1046  MachineBasicBlock::iterator Insert = Entry.front();
1047  DebugLoc DL = Insert->getDebugLoc();
1048 
1049  TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1050  *MF);
1051  if (TIDReg == AMDGPU::NoRegister)
1052  return TIDReg;
1053 
1055  WorkGroupSize > WavefrontSize) {
1056  unsigned TIDIGXReg
1058  unsigned TIDIGYReg
1060  unsigned TIDIGZReg
1062  unsigned InputPtrReg =
1064  for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1065  if (!Entry.isLiveIn(Reg))
1066  Entry.addLiveIn(Reg);
1067  }
1068 
1069  RS->enterBasicBlock(Entry);
1070  // FIXME: Can we scavenge an SReg_64 and access the subregs?
1071  unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1072  unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1073  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1074  .addReg(InputPtrReg)
1076  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1077  .addReg(InputPtrReg)
1079 
1080  // NGROUPS.X * NGROUPS.Y
1081  BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1082  .addReg(STmp1)
1083  .addReg(STmp0);
1084  // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1085  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1086  .addReg(STmp1)
1087  .addReg(TIDIGXReg);
1088  // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1089  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1090  .addReg(STmp0)
1091  .addReg(TIDIGYReg)
1092  .addReg(TIDReg);
1093  // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1094  getAddNoCarry(Entry, Insert, DL, TIDReg)
1095  .addReg(TIDReg)
1096  .addReg(TIDIGZReg);
1097  } else {
1098  // Get the wave id
1099  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1100  TIDReg)
1101  .addImm(-1)
1102  .addImm(0);
1103 
1104  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1105  TIDReg)
1106  .addImm(-1)
1107  .addReg(TIDReg);
1108  }
1109 
1110  BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1111  TIDReg)
1112  .addImm(2)
1113  .addReg(TIDReg);
1114  MFI->setTIDReg(TIDReg);
1115  }
1116 
1117  // Add FrameIndex to LDS offset
1118  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1119  getAddNoCarry(MBB, MI, DL, TmpReg)
1120  .addImm(LDSOffset)
1121  .addReg(TIDReg);
1122 
1123  return TmpReg;
1124 }
1125 
1128  int Count) const {
1129  DebugLoc DL = MBB.findDebugLoc(MI);
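 // Each S_NOP can cover at most 8 wait states (its immediate encodes the count
 // minus one), so emit a chain of nops until Count is exhausted.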
1130  while (Count > 0) {
1131  int Arg;
1132  if (Count >= 8)
1133  Arg = 7;
1134  else
1135  Arg = Count - 1;
1136  Count -= 8;
1137  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1138  .addImm(Arg);
1139  }
1140 }
1141 
1144  insertWaitStates(MBB, MI, 1);
1145 }
1146 
1148  auto MF = MBB.getParent();
1149  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1150 
1151  assert(Info->isEntryFunction());
1152 
1153  if (MBB.succ_empty()) {
1154  bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1155  if (HasNoTerminator)
1156  BuildMI(MBB, MBB.end(), DebugLoc(),
1157  get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1158  }
1159 }
1160 
1162  switch (MI.getOpcode()) {
1163  default: return 1; // FIXME: Do wait states equal cycles?
1164 
1165  case AMDGPU::S_NOP:
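 // S_NOP waits for its immediate operand plus one wait states.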
1166  return MI.getOperand(0).getImm() + 1;
1167  }
1168 }
1169 
1171  MachineBasicBlock &MBB = *MI.getParent();
1172  DebugLoc DL = MBB.findDebugLoc(MI);
1173  switch (MI.getOpcode()) {
1174  default: return TargetInstrInfo::expandPostRAPseudo(MI);
1175  case AMDGPU::S_MOV_B64_term:
1176  // This is only a terminator to get the correct spill code placement during
1177  // register allocation.
1178  MI.setDesc(get(AMDGPU::S_MOV_B64));
1179  break;
1180 
1181  case AMDGPU::S_XOR_B64_term:
1182  // This is only a terminator to get the correct spill code placement during
1183  // register allocation.
1184  MI.setDesc(get(AMDGPU::S_XOR_B64));
1185  break;
1186 
1187  case AMDGPU::S_ANDN2_B64_term:
1188  // This is only a terminator to get the correct spill code placement during
1189  // register allocation.
1190  MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1191  break;
1192 
1193  case AMDGPU::V_MOV_B64_PSEUDO: {
1194  unsigned Dst = MI.getOperand(0).getReg();
1195  unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1196  unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
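 // There is no 64-bit VALU move on these targets, so expand into two
 // V_MOV_B32s of the low and high halves; Dst is added as an implicit def so
 // the full 64-bit register is still seen as defined.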
1197 
1198  const MachineOperand &SrcOp = MI.getOperand(1);
1199  // FIXME: Will this work for 64-bit floating point immediates?
1200  assert(!SrcOp.isFPImm());
1201  if (SrcOp.isImm()) {
1202  APInt Imm(64, SrcOp.getImm());
1203  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1204  .addImm(Imm.getLoBits(32).getZExtValue())
1205  .addReg(Dst, RegState::Implicit | RegState::Define);
1206  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1207  .addImm(Imm.getHiBits(32).getZExtValue())
1208  .addReg(Dst, RegState::Implicit | RegState::Define);
1209  } else {
1210  assert(SrcOp.isReg());
1211  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1212  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1214  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1215  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1217  }
1218  MI.eraseFromParent();
1219  break;
1220  }
1221  case AMDGPU::V_SET_INACTIVE_B32: {
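 // Invert EXEC so the move writes only the lanes that are currently inactive,
 // then restore EXEC afterwards.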
1222  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1223  .addReg(AMDGPU::EXEC);
1224  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1225  .add(MI.getOperand(2));
1226  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1227  .addReg(AMDGPU::EXEC);
1228  MI.eraseFromParent();
1229  break;
1230  }
1231  case AMDGPU::V_SET_INACTIVE_B64: {
1232  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1233  .addReg(AMDGPU::EXEC);
1234  MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1235  MI.getOperand(0).getReg())
1236  .add(MI.getOperand(2));
1237  expandPostRAPseudo(*Copy);
1238  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1239  .addReg(AMDGPU::EXEC);
1240  MI.eraseFromParent();
1241  break;
1242  }
1243  case AMDGPU::V_MOVRELD_B32_V1:
1244  case AMDGPU::V_MOVRELD_B32_V2:
1245  case AMDGPU::V_MOVRELD_B32_V4:
1246  case AMDGPU::V_MOVRELD_B32_V8:
1247  case AMDGPU::V_MOVRELD_B32_V16: {
1248  const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1249  unsigned VecReg = MI.getOperand(0).getReg();
1250  bool IsUndef = MI.getOperand(1).isUndef();
1251  unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1252  assert(VecReg == MI.getOperand(1).getReg());
1253 
1254  MachineInstr *MovRel =
1255  BuildMI(MBB, MI, DL, MovRelDesc)
1256  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1257  .add(MI.getOperand(2))
1258  .addReg(VecReg, RegState::ImplicitDefine)
1259  .addReg(VecReg,
1260  RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1261 
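 // Tie the implicit def of the full vector register to its implicit use so
 // both are assigned the same register.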
1262  const int ImpDefIdx =
1263  MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1264  const int ImpUseIdx = ImpDefIdx + 1;
1265  MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1266 
1267  MI.eraseFromParent();
1268  break;
1269  }
1270  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1271  MachineFunction &MF = *MBB.getParent();
1272  unsigned Reg = MI.getOperand(0).getReg();
1273  unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1274  unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1275 
1276  // Create a bundle so these instructions won't be re-ordered by the
1277  // post-RA scheduler.
1278  MIBundleBuilder Bundler(MBB, MI);
1279  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1280 
1281  // Add 32-bit offset from this instruction to the start of the
1282  // constant data.
1283  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1284  .addReg(RegLo)
1285  .add(MI.getOperand(1)));
1286 
1287  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1288  .addReg(RegHi);
1290  MIB.addImm(0);
1291  else
1292  MIB.add(MI.getOperand(2));
1293 
1294  Bundler.append(MIB);
1295  finalizeBundle(MBB, Bundler.begin());
1296 
1297  MI.eraseFromParent();
1298  break;
1299  }
1300  case AMDGPU::EXIT_WWM: {
1301  // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1302  // is exited.
1303  MI.setDesc(get(AMDGPU::S_MOV_B64));
1304  break;
1305  }
1306  case TargetOpcode::BUNDLE: {
1307  if (!MI.mayLoad())
1308  return false;
1309 
1310  // If it is a load, it must be a memory clause.
1312  I->isBundledWithSucc(); ++I) {
1313  I->unbundleFromSucc();
1314  for (MachineOperand &MO : I->operands())
1315  if (MO.isReg())
1316  MO.setIsInternalRead(false);
1317  }
1318 
1319  MI.eraseFromParent();
1320  break;
1321  }
1322  }
1323  return true;
1324 }
1325 
1327  MachineOperand &Src0,
1328  unsigned Src0OpName,
1329  MachineOperand &Src1,
1330  unsigned Src1OpName) const {
1331  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1332  if (!Src0Mods)
1333  return false;
1334 
1335  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1336  assert(Src1Mods &&
1337  "All commutable instructions have both src0 and src1 modifiers");
1338 
1339  int Src0ModsVal = Src0Mods->getImm();
1340  int Src1ModsVal = Src1Mods->getImm();
1341 
1342  Src1Mods->setImm(Src0ModsVal);
1343  Src0Mods->setImm(Src1ModsVal);
1344  return true;
1345 }
1346 
1348  MachineOperand &RegOp,
1349  MachineOperand &NonRegOp) {
1350  unsigned Reg = RegOp.getReg();
1351  unsigned SubReg = RegOp.getSubReg();
1352  bool IsKill = RegOp.isKill();
1353  bool IsDead = RegOp.isDead();
1354  bool IsUndef = RegOp.isUndef();
1355  bool IsDebug = RegOp.isDebug();
1356 
1357  if (NonRegOp.isImm())
1358  RegOp.ChangeToImmediate(NonRegOp.getImm());
1359  else if (NonRegOp.isFI())
1360  RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1361  else
1362  return nullptr;
1363 
1364  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1365  NonRegOp.setSubReg(SubReg);
1366 
1367  return &MI;
1368 }
1369 
1371  unsigned Src0Idx,
1372  unsigned Src1Idx) const {
1373  assert(!NewMI && "this should never be used");
1374 
1375  unsigned Opc = MI.getOpcode();
1376  int CommutedOpcode = commuteOpcode(Opc);
1377  if (CommutedOpcode == -1)
1378  return nullptr;
1379 
1380  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1381  static_cast<int>(Src0Idx) &&
1382  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1383  static_cast<int>(Src1Idx) &&
1384  "inconsistency with findCommutedOpIndices");
1385 
1386  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1387  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1388 
1389  MachineInstr *CommutedMI = nullptr;
1390  if (Src0.isReg() && Src1.isReg()) {
1391  if (isOperandLegal(MI, Src1Idx, &Src0)) {
1392  // Be sure to copy the source modifiers to the right place.
1393  CommutedMI
1394  = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1395  }
1396 
1397  } else if (Src0.isReg() && !Src1.isReg()) {
1398  // src0 should always be able to support any operand type, so no need to
1399  // check operand legality.
1400  CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1401  } else if (!Src0.isReg() && Src1.isReg()) {
1402  if (isOperandLegal(MI, Src1Idx, &Src0))
1403  CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1404  } else {
1405  // FIXME: Found two non registers to commute. This does happen.
1406  return nullptr;
1407  }
1408 
1409  if (CommutedMI) {
1410  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1411  Src1, AMDGPU::OpName::src1_modifiers);
1412 
1413  CommutedMI->setDesc(get(CommutedOpcode));
1414  }
1415 
1416  return CommutedMI;
1417 }
1418 
1419 // This needs to be implemented because the source modifiers may be inserted
1420 // between the true commutable operands, and the base
1421 // TargetInstrInfo::commuteInstruction uses it.
1423  unsigned &SrcOpIdx1) const {
1424  if (!MI.isCommutable())
1425  return false;
1426 
1427  unsigned Opc = MI.getOpcode();
1428  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1429  if (Src0Idx == -1)
1430  return false;
1431 
1432  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1433  if (Src1Idx == -1)
1434  return false;
1435 
1436  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1437 }
1438 
1439 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1440  int64_t BrOffset) const {
1441  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1442  // block is unanalyzable.
1443  assert(BranchOp != AMDGPU::S_SETPC_B64);
1444 
1445  // Convert to dwords.
1446  BrOffset /= 4;
1447 
1448  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1449  // from the next instruction.
1450  BrOffset -= 1;
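 // With the default of 16 branch offset bits this accepts offsets of roughly
 // +/-2^15 dwords (about +/-128 KiB) from the instruction after the branch.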
1451 
1452  return isIntN(BranchOffsetBits, BrOffset);
1453 }
1454 
1456  const MachineInstr &MI) const {
1457  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1458  // This would be a difficult analysis to perform, but it can always be legal,
1459  // so there's no need to analyze it.
1460  return nullptr;
1461  }
1462 
1463  return MI.getOperand(0).getMBB();
1464 }
1465 
1467  MachineBasicBlock &DestBB,
1468  const DebugLoc &DL,
1469  int64_t BrOffset,
1470  RegScavenger *RS) const {
1471  assert(RS && "RegScavenger required for long branching");
1472  assert(MBB.empty() &&
1473  "new block should be inserted for expanding unconditional branch");
1474  assert(MBB.pred_size() == 1);
1475 
1476  MachineFunction *MF = MBB.getParent();
1477  MachineRegisterInfo &MRI = MF->getRegInfo();
1478 
1479  // FIXME: Virtual register workaround for RegScavenger not working with empty
1480  // blocks.
1481  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1482 
1483  auto I = MBB.end();
1484 
1485  // We need to compute the offset relative to the instruction immediately after
1486  // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
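 // The expansion is:
 //   s_getpc_b64 pc
 //   s_add_u32 / s_addc_u32 (or s_sub_u32 / s_subb_u32) pc, pc, offset
 //   s_setpc_b64 pc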
1487  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1488 
1489  // TODO: Handle > 32-bit block address.
1490  if (BrOffset >= 0) {
1491  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1492  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1493  .addReg(PCReg, 0, AMDGPU::sub0)
1495  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1496  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1497  .addReg(PCReg, 0, AMDGPU::sub1)
1498  .addImm(0);
1499  } else {
1500  // Backwards branch.
1501  BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1502  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1503  .addReg(PCReg, 0, AMDGPU::sub0)
1505  BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1506  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1507  .addReg(PCReg, 0, AMDGPU::sub1)
1508  .addImm(0);
1509  }
1510 
1511  // Insert the indirect branch after the other terminator.
1512  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1513  .addReg(PCReg);
1514 
1515  // FIXME: If spilling is necessary, this will fail because this scavenger has
1516  // no emergency stack slots. It is non-trivial to spill in this situation,
1517  // because the restore code needs to be specially placed after the
1518  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1519  // block.
1520  //
1521  // If a spill is needed for the pc register pair, we need to insert a spill
1522  // restore block right before the destination block, and insert a short branch
1523  // into the old destination block's fallthrough predecessor.
1524  // e.g.:
1525  //
1526  // s_cbranch_scc0 skip_long_branch:
1527  //
1528  // long_branch_bb:
1529  // spill s[8:9]
1530  // s_getpc_b64 s[8:9]
1531  // s_add_u32 s8, s8, restore_bb
1532  // s_addc_u32 s9, s9, 0
1533  // s_setpc_b64 s[8:9]
1534  //
1535  // skip_long_branch:
1536  // foo;
1537  //
1538  // .....
1539  //
1540  // dest_bb_fallthrough_predecessor:
1541  // bar;
1542  // s_branch dest_bb
1543  //
1544  // restore_bb:
1545  // restore s[8:9]
1546  // fallthrough dest_bb
1547  //
1548  // dest_bb:
1549  // buzz;
1550 
1551  RS->enterBasicBlockEnd(MBB);
1552  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1553  MachineBasicBlock::iterator(GetPC), 0);
1554  MRI.replaceRegWith(PCReg, Scav);
1555  MRI.clearVirtRegs();
1556  RS->setRegUsed(Scav);
1557 
1558  return 4 + 8 + 4 + 4;
1559 }
1560 
1561 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1562  switch (Cond) {
1563  case SIInstrInfo::SCC_TRUE:
1564  return AMDGPU::S_CBRANCH_SCC1;
1565  case SIInstrInfo::SCC_FALSE:
1566  return AMDGPU::S_CBRANCH_SCC0;
1567  case SIInstrInfo::VCCNZ:
1568  return AMDGPU::S_CBRANCH_VCCNZ;
1569  case SIInstrInfo::VCCZ:
1570  return AMDGPU::S_CBRANCH_VCCZ;
1571  case SIInstrInfo::EXECNZ:
1572  return AMDGPU::S_CBRANCH_EXECNZ;
1573  case SIInstrInfo::EXECZ:
1574  return AMDGPU::S_CBRANCH_EXECZ;
1575  default:
1576  llvm_unreachable("invalid branch predicate");
1577  }
1578 }
1579 
1580 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1581  switch (Opcode) {
1582  case AMDGPU::S_CBRANCH_SCC0:
1583  return SCC_FALSE;
1584  case AMDGPU::S_CBRANCH_SCC1:
1585  return SCC_TRUE;
1586  case AMDGPU::S_CBRANCH_VCCNZ:
1587  return VCCNZ;
1588  case AMDGPU::S_CBRANCH_VCCZ:
1589  return VCCZ;
1590  case AMDGPU::S_CBRANCH_EXECNZ:
1591  return EXECNZ;
1592  case AMDGPU::S_CBRANCH_EXECZ:
1593  return EXECZ;
1594  default:
1595  return INVALID_BR;
1596  }
1597 }
1598 
1601  MachineBasicBlock *&TBB,
1602  MachineBasicBlock *&FBB,
1604  bool AllowModify) const {
1605  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1606  // Unconditional Branch
1607  TBB = I->getOperand(0).getMBB();
1608  return false;
1609  }
1610 
1611  MachineBasicBlock *CondBB = nullptr;
1612 
1613  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1614  CondBB = I->getOperand(1).getMBB();
1615  Cond.push_back(I->getOperand(0));
1616  } else {
1617  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1618  if (Pred == INVALID_BR)
1619  return true;
1620 
1621  CondBB = I->getOperand(0).getMBB();
1623  Cond.push_back(I->getOperand(1)); // Save the branch register.
1624  }
1625  ++I;
1626 
1627  if (I == MBB.end()) {
1628  // Conditional branch followed by fall-through.
1629  TBB = CondBB;
1630  return false;
1631  }
1632 
1633  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1634  TBB = CondBB;
1635  FBB = I->getOperand(0).getMBB();
1636  return false;
1637  }
1638 
1639  return true;
1640 }
1641 
1643  MachineBasicBlock *&FBB,
1645  bool AllowModify) const {
1647  if (I == MBB.end())
1648  return false;
1649 
1650  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1651  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1652 
1653  ++I;
1654 
1655  // TODO: Should be able to treat as fallthrough?
1656  if (I == MBB.end())
1657  return true;
1658 
1659  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1660  return true;
1661 
1662  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1663 
1664  // Specifically handle the case where the conditional branch is to the same
1665  // destination as the mask branch. e.g.
1666  //
1667  // si_mask_branch BB8
1668  // s_cbranch_execz BB8
1669  // s_cbranch BB9
1670  //
1671  // This is required to understand divergent loops which may need the branches
1672  // to be relaxed.
1673  if (TBB != MaskBrDest || Cond.empty())
1674  return true;
1675 
1676  auto Pred = Cond[0].getImm();
1677  return (Pred != EXECZ && Pred != EXECNZ);
1678 }
1679 
1681  int *BytesRemoved) const {
1683 
1684  unsigned Count = 0;
1685  unsigned RemovedSize = 0;
1686  while (I != MBB.end()) {
1687  MachineBasicBlock::iterator Next = std::next(I);
1688  if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1689  I = Next;
1690  continue;
1691  }
1692 
1693  RemovedSize += getInstSizeInBytes(*I);
1694  I->eraseFromParent();
1695  ++Count;
1696  I = Next;
1697  }
1698 
1699  if (BytesRemoved)
1700  *BytesRemoved = RemovedSize;
1701 
1702  return Count;
1703 }
1704 
1705 // Copy the flags onto the implicit condition register operand.
1707  const MachineOperand &OrigCond) {
1708  CondReg.setIsUndef(OrigCond.isUndef());
1709  CondReg.setIsKill(OrigCond.isKill());
1710 }
1711 
1713  MachineBasicBlock *TBB,
1714  MachineBasicBlock *FBB,
1716  const DebugLoc &DL,
1717  int *BytesAdded) const {
1718  if (!FBB && Cond.empty()) {
1719  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1720  .addMBB(TBB);
1721  if (BytesAdded)
1722  *BytesAdded = 4;
1723  return 1;
1724  }
1725 
1726  if (Cond.size() == 1 && Cond[0].isReg()) {
1727  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1728  .add(Cond[0])
1729  .addMBB(TBB);
1730  return 1;
1731  }
1732 
1733  assert(TBB && Cond[0].isImm());
1734 
1735  unsigned Opcode
1736  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1737 
1738  if (!FBB) {
1739  Cond[1].isUndef();
1740  MachineInstr *CondBr =
1741  BuildMI(&MBB, DL, get(Opcode))
1742  .addMBB(TBB);
1743 
1744  // Copy the flags onto the implicit condition register operand.
1745  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1746 
1747  if (BytesAdded)
1748  *BytesAdded = 4;
1749  return 1;
1750  }
1751 
1752  assert(TBB && FBB);
1753 
1754  MachineInstr *CondBr =
1755  BuildMI(&MBB, DL, get(Opcode))
1756  .addMBB(TBB);
1757  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1758  .addMBB(FBB);
1759 
1760  MachineOperand &CondReg = CondBr->getOperand(1);
1761  CondReg.setIsUndef(Cond[1].isUndef());
1762  CondReg.setIsKill(Cond[1].isKill());
1763 
1764  if (BytesAdded)
1765  *BytesAdded = 8;
1766 
1767  return 2;
1768 }
1769 
1771  SmallVectorImpl<MachineOperand> &Cond) const {
1772  if (Cond.size() != 2) {
1773  return true;
1774  }
1775 
1776  if (Cond[0].isImm()) {
1777  Cond[0].setImm(-Cond[0].getImm());
1778  return false;
1779  }
1780 
1781  return true;
1782 }
1783 
1786  unsigned TrueReg, unsigned FalseReg,
1787  int &CondCycles,
1788  int &TrueCycles, int &FalseCycles) const {
1789  switch (Cond[0].getImm()) {
1790  case VCCNZ:
1791  case VCCZ: {
1792  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1793  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1794  assert(MRI.getRegClass(FalseReg) == RC);
1795 
1796  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1797  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1798 
1799  // Limit to equal cost for branch vs. N v_cndmask_b32s.
1800  return !RI.isSGPRClass(RC) && NumInsts <= 6;
1801  }
1802  case SCC_TRUE:
1803  case SCC_FALSE: {
1804  // FIXME: We could insert for VGPRs if we could replace the original compare
1805  // with a vector one.
1806  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1807  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1808  assert(MRI.getRegClass(FalseReg) == RC);
1809 
1810  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1811 
1812  // Register sizes that are a multiple of 64 bits can use s_cselect_b64.
1813  if (NumInsts % 2 == 0)
1814  NumInsts /= 2;
1815 
1816  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1817  return RI.isSGPRClass(RC);
1818  }
1819  default:
1820  return false;
1821  }
1822 }
1823 
1826  unsigned DstReg, ArrayRef<MachineOperand> Cond,
1827  unsigned TrueReg, unsigned FalseReg) const {
1828  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1829  if (Pred == VCCZ || Pred == SCC_FALSE) {
1830  Pred = static_cast<BranchPredicate>(-Pred);
1831  std::swap(TrueReg, FalseReg);
1832  }
1833 
1835  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1836  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1837 
1838  if (DstSize == 32) {
1839  unsigned SelOp = Pred == SCC_TRUE ?
1840  AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1841 
1842  // Instruction's operands are backwards from what is expected.
1843  MachineInstr *Select =
1844  BuildMI(MBB, I, DL, get(SelOp), DstReg)
1845  .addReg(FalseReg)
1846  .addReg(TrueReg);
1847 
1848  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1849  return;
1850  }
1851 
1852  if (DstSize == 64 && Pred == SCC_TRUE) {
1853  MachineInstr *Select =
1854  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1855  .addReg(FalseReg)
1856  .addReg(TrueReg);
1857 
1858  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1859  return;
1860  }
1861 
1862  static const int16_t Sub0_15[] = {
1863  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1864  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1865  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1866  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1867  };
1868 
1869  static const int16_t Sub0_15_64[] = {
1870  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1871  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1872  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1873  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1874  };
1875 
1876  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1877  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1878  const int16_t *SubIndices = Sub0_15;
1879  int NElts = DstSize / 32;
1880 
1881  // 64-bit select is only available for SALU.
1882  if (Pred == SCC_TRUE) {
1883  SelOp = AMDGPU::S_CSELECT_B64;
1884  EltRC = &AMDGPU::SGPR_64RegClass;
1885  SubIndices = Sub0_15_64;
1886 
1887  assert(NElts % 2 == 0);
1888  NElts /= 2;
1889  }
1890 
1891  MachineInstrBuilder MIB = BuildMI(
1892  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1893 
1894  I = MIB->getIterator();
1895 
1896  SmallVector<unsigned, 8> Regs;
1897  for (int Idx = 0; Idx != NElts; ++Idx) {
1898  unsigned DstElt = MRI.createVirtualRegister(EltRC);
1899  Regs.push_back(DstElt);
1900 
1901  unsigned SubIdx = SubIndices[Idx];
1902 
1903  MachineInstr *Select =
1904  BuildMI(MBB, I, DL, get(SelOp), DstElt)
1905  .addReg(FalseReg, 0, SubIdx)
1906  .addReg(TrueReg, 0, SubIdx);
1907  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1908 
1909  MIB.addReg(DstElt)
1910  .addImm(SubIdx);
1911  }
1912 }
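// Illustrative expansion (virtual register names assumed): a 64-bit VGPR
// select on VCCNZ is split per 32-bit element and recombined, roughly:
//   %lo  = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   %hi  = V_CNDMASK_B32_e32 %false.sub1, %true.sub1, implicit $vcc
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1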
1913 
1915  switch (MI.getOpcode()) {
1916  case AMDGPU::V_MOV_B32_e32:
1917  case AMDGPU::V_MOV_B32_e64:
1918  case AMDGPU::V_MOV_B64_PSEUDO: {
1919  // If there are additional implicit register operands, this may be used for
1920  // register indexing so the source register operand isn't simply copied.
1921  unsigned NumOps = MI.getDesc().getNumOperands() +
1922  MI.getDesc().getNumImplicitUses();
1923 
1924  return MI.getNumOperands() == NumOps;
1925  }
1926  case AMDGPU::S_MOV_B32:
1927  case AMDGPU::S_MOV_B64:
1928  case AMDGPU::COPY:
1929  return true;
1930  default:
1931  return false;
1932  }
1933 }
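// For example (illustrative): "%0 = V_MOV_B32_e32 42, implicit $exec" carries
// only its declared and implicit-use operands, so it is treated as a foldable
// copy; the same mov with extra implicit operands added for indirect register
// indexing is not, since its source is no longer simply copied.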
1934 
1937  switch(Kind) {
1940  return ST.getAMDGPUAS().PRIVATE_ADDRESS;
1947  return ST.getAMDGPUAS().CONSTANT_ADDRESS;
1948  }
1949  return ST.getAMDGPUAS().FLAT_ADDRESS;
1950 }
1951 
1952 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
1953  unsigned Opc = MI.getOpcode();
1954  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1955  AMDGPU::OpName::src0_modifiers);
1956  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1957  AMDGPU::OpName::src1_modifiers);
1958  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1959  AMDGPU::OpName::src2_modifiers);
1960 
1961  MI.RemoveOperand(Src2ModIdx);
1962  MI.RemoveOperand(Src1ModIdx);
1963  MI.RemoveOperand(Src0ModIdx);
1964 }
1965 
1966 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1967  unsigned Reg, MachineRegisterInfo *MRI) const {
1968  if (!MRI->hasOneNonDBGUse(Reg))
1969  return false;
1970 
1971  switch (DefMI.getOpcode()) {
1972  default:
1973  return false;
1974  case AMDGPU::S_MOV_B64:
1975  // TODO: We could fold 64-bit immediates, but this gets complicated
1976  // when there are sub-registers.
1977  return false;
1978 
1979  case AMDGPU::V_MOV_B32_e32:
1980  case AMDGPU::S_MOV_B32:
1981  break;
1982  }
1983 
1984  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1985  assert(ImmOp);
1986  // FIXME: We could handle FrameIndex values here.
1987  if (!ImmOp->isImm())
1988  return false;
1989 
1990  unsigned Opc = UseMI.getOpcode();
1991  if (Opc == AMDGPU::COPY) {
1992  bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1993  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1994  UseMI.setDesc(get(NewOpc));
1995  UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
1996  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
1997  return true;
1998  }
1999 
2000  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2001  Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2002  // Don't fold if we are using source or output modifiers. The new VOP2
2003  // instructions don't have them.
2004  if (hasAnyModifiersSet(UseMI))
2005  return false;
2006 
2007  // If this is a free constant, there's no reason to do this.
2008  // TODO: We could fold this here instead of letting SIFoldOperands do it
2009  // later.
2010  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2011 
2012  // Any src operand can be used for the legality check.
2013  if (isInlineConstant(UseMI, *Src0, *ImmOp))
2014  return false;
2015 
2016  bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2017  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2018  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2019 
2020  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2021  // We should only expect these to be on src0 due to canonicalizations.
2022  if (Src0->isReg() && Src0->getReg() == Reg) {
2023  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2024  return false;
2025 
2026  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2027  return false;
2028 
2029  // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2030 
2031  const int64_t Imm = ImmOp->getImm();
2032 
2033  // FIXME: This would be a lot easier if we could return a new instruction
2034  // instead of having to modify in place.
2035 
2036  // Remove these first since they are at the end.
2037  UseMI.RemoveOperand(
2038  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2039  UseMI.RemoveOperand(
2040  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2041 
2042  unsigned Src1Reg = Src1->getReg();
2043  unsigned Src1SubReg = Src1->getSubReg();
2044  Src0->setReg(Src1Reg);
2045  Src0->setSubReg(Src1SubReg);
2046  Src0->setIsKill(Src1->isKill());
2047 
2048  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2049  Opc == AMDGPU::V_MAC_F16_e64)
2050  UseMI.untieRegOperand(
2051  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2052 
2053  Src1->ChangeToImmediate(Imm);
2054 
2055  removeModOperands(UseMI);
2056  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2057 
2058  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2059  if (DeleteDef)
2060  DefMI.eraseFromParent();
2061 
2062  return true;
2063  }
2064 
2065  // Added part is the constant: Use v_madak_{f16, f32}.
2066  if (Src2->isReg() && Src2->getReg() == Reg) {
2067  // Not allowed to use constant bus for another operand.
2068  // We can however allow an inline immediate as src0.
2069  if (!Src0->isImm() &&
2070  (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2071  return false;
2072 
2073  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2074  return false;
2075 
2076  const int64_t Imm = ImmOp->getImm();
2077 
2078  // FIXME: This would be a lot easier if we could return a new instruction
2079  // instead of having to modify in place.
2080 
2081  // Remove these first since they are at the end.
2082  UseMI.RemoveOperand(
2083  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2084  UseMI.RemoveOperand(
2085  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2086 
2087  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2088  Opc == AMDGPU::V_MAC_F16_e64)
2089  UseMI.untieRegOperand(
2090  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2091 
2092  // ChangingToImmediate adds Src2 back to the instruction.
2093  Src2->ChangeToImmediate(Imm);
2094 
2095  // These come before src2.
2096  removeModOperands(UseMI);
2097  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2098 
2099  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2100  if (DeleteDef)
2101  DefMI.eraseFromParent();
2102 
2103  return true;
2104  }
2105  }
2106 
2107  return false;
2108 }
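// Illustrative folding (register names assumed, modifiers omitted): given
//   %imm = V_MOV_B32_e32 1077936128    ; 3.0f, not an inline constant
//   %d   = V_MAC_F32_e64 %imm, %b, %d
// the immediate used as src0 folds to
//   %d   = V_MADMK_F32 %b, 1077936128, %d
// while the same immediate feeding src2 would fold to V_MADAK_F32 instead.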
2109 
2110 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2111  int WidthB, int OffsetB) {
2112  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2113  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2114  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2115  return LowOffset + LowWidth <= HighOffset;
2116 }
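// For example (illustrative): a 4-byte access at offset 0 and a 4-byte access
// at offset 4 are disjoint (0 + 4 <= 4), while a 4-byte access at offset 2
// overlaps the first (0 + 4 > 2), so the function returns false.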
2117 
2118 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2119  MachineInstr &MIb) const {
2120  unsigned BaseReg0, BaseReg1;
2121  int64_t Offset0, Offset1;
2122 
2123  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
2124  getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
2125 
2126  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2127  // FIXME: Handle ds_read2 / ds_write2.
2128  return false;
2129  }
2130  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2131  unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2132  if (BaseReg0 == BaseReg1 &&
2133  offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2134  return true;
2135  }
2136  }
2137 
2138  return false;
2139 }
2140 
2141 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2142  MachineInstr &MIb,
2143  AliasAnalysis *AA) const {
2144  assert((MIa.mayLoad() || MIa.mayStore()) &&
2145  "MIa must load from or modify a memory location");
2146  assert((MIb.mayLoad() || MIb.mayStore()) &&
2147  "MIb must load from or modify a memory location");
2148 
2149  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2150  return false;
2151 
2152  // XXX - Can we relax this between address spaces?
2153  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2154  return false;
2155 
2156  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2157  const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2158  const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2159  if (MMOa->getValue() && MMOb->getValue()) {
2160  MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2161  MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2162  if (!AA->alias(LocA, LocB))
2163  return true;
2164  }
2165  }
2166 
2167  // TODO: Should we check the address space from the MachineMemOperand? That
2168  // would allow us to distinguish objects we know don't alias based on the
2169  // underlying address space, even if it was lowered to a different one,
2170  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2171  // buffer.
2172  if (isDS(MIa)) {
2173  if (isDS(MIb))
2174  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2175 
2176  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2177  }
2178 
2179  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2180  if (isMUBUF(MIb) || isMTBUF(MIb))
2181  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2182 
2183  return !isFLAT(MIb) && !isSMRD(MIb);
2184  }
2185 
2186  if (isSMRD(MIa)) {
2187  if (isSMRD(MIb))
2188  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2189 
2190  return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2191  }
2192 
2193  if (isFLAT(MIa)) {
2194  if (isFLAT(MIb))
2195  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2196 
2197  return false;
2198  }
2199 
2200  return false;
2201 }
2202 
2203 static int64_t getFoldableImm(const MachineOperand* MO) {
2204  if (!MO->isReg())
2205  return false;
2206  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2207  const MachineRegisterInfo &MRI = MF->getRegInfo();
2208  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2209  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2210  Def->getOperand(1).isImm())
2211  return Def->getOperand(1).getImm();
2212  return AMDGPU::NoRegister;
2213 }
2214 
2215 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2216  MachineInstr &MI,
2217  LiveVariables *LV) const {
2218  unsigned Opc = MI.getOpcode();
2219  bool IsF16 = false;
2220  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2221 
2222  switch (Opc) {
2223  default:
2224  return nullptr;
2225  case AMDGPU::V_MAC_F16_e64:
2226  IsF16 = true;
2227  LLVM_FALLTHROUGH;
2228  case AMDGPU::V_MAC_F32_e64:
2229  case AMDGPU::V_FMAC_F32_e64:
2230  break;
2231  case AMDGPU::V_MAC_F16_e32:
2232  IsF16 = true;
2233  LLVM_FALLTHROUGH;
2234  case AMDGPU::V_MAC_F32_e32:
2235  case AMDGPU::V_FMAC_F32_e32: {
2236  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2237  AMDGPU::OpName::src0);
2238  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2239  if (!Src0->isReg() && !Src0->isImm())
2240  return nullptr;
2241 
2242  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2243  return nullptr;
2244 
2245  break;
2246  }
2247  }
2248 
2249  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2250  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2251  const MachineOperand *Src0Mods =
2252  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2253  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2254  const MachineOperand *Src1Mods =
2255  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2256  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2257  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2258  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2259 
2260  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2261  // If we have an SGPR input, we will violate the constant bus restriction.
2262  (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2263  if (auto Imm = getFoldableImm(Src2)) {
2264  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2265  get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2266  .add(*Dst)
2267  .add(*Src0)
2268  .add(*Src1)
2269  .addImm(Imm);
2270  }
2271  if (auto Imm = getFoldableImm(Src1)) {
2272  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2273  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2274  .add(*Dst)
2275  .add(*Src0)
2276  .addImm(Imm)
2277  .add(*Src2);
2278  }
2279  if (auto Imm = getFoldableImm(Src0)) {
2280  if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2281  AMDGPU::OpName::src0), Src1))
2282  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2283  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2284  .add(*Dst)
2285  .add(*Src1)
2286  .addImm(Imm)
2287  .add(*Src2);
2288  }
2289  }
2290 
2291  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2292  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2293  (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2294  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2295  .add(*Dst)
2296  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2297  .add(*Src0)
2298  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2299  .add(*Src1)
2300  .addImm(0) // Src mods
2301  .add(*Src2)
2302  .addImm(Clamp ? Clamp->getImm() : 0)
2303  .addImm(Omod ? Omod->getImm() : 0);
2304 }
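// Illustrative conversion (register names assumed): the two-address
//   %dst = V_MAC_F32_e32 %a, %b, %dst(tied)
// is rebuilt in VOP3 form as roughly
//   %dst = V_MAD_F32 0, %a, 0, %b, 0, %dst, 0, 0
// with modifiers, clamp and omod written as 0; if one source is defined by a
// V_MOV_B32 of a plain immediate, V_MADMK_F32 (constant multiplicand) or
// V_MADAK_F32 (constant addend) is produced instead, as checked above.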
2305 
2306 // It's not generally safe to move VALU instructions across these since it will
2307 // start using the register as a base index rather than directly.
2308 // XXX - Why isn't hasSideEffects sufficient for these?
2309 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2310  switch (MI.getOpcode()) {
2311  case AMDGPU::S_SET_GPR_IDX_ON:
2312  case AMDGPU::S_SET_GPR_IDX_MODE:
2313  case AMDGPU::S_SET_GPR_IDX_OFF:
2314  return true;
2315  default:
2316  return false;
2317  }
2318 }
2319 
2320 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2321  const MachineBasicBlock *MBB,
2322  const MachineFunction &MF) const {
2323  // XXX - Do we want the SP check in the base implementation?
2324 
2325  // Target-independent instructions do not have an implicit-use of EXEC, even
2326  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2327  // boundaries prevents incorrect movements of such instructions.
2328  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2329  MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2330  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2331  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2332  changesVGPRIndexingMode(MI);
2333 }
2334 
2335 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
2336  unsigned Opcode = MI.getOpcode();
2337 
2338  if (MI.mayStore() && isSMRD(MI))
2339  return true; // scalar store or atomic
2340 
2341  // These instructions cause shader I/O that may cause hardware lockups
2342  // when executed with an empty EXEC mask.
2343  //
2344  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2345  // EXEC = 0, but checking for that case here seems not worth it
2346  // given the typical code patterns.
2347  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2348  Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
2349  return true;
2350 
2351  if (MI.isInlineAsm())
2352  return true; // conservative assumption
2353 
2354  // These are like SALU instructions in terms of effects, so it's questionable
2355  // whether we should return true for those.
2356  //
2357  // However, executing them with EXEC = 0 causes them to operate on undefined
2358  // data, which we avoid by returning true here.
2359  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2360  return true;
2361 
2362  return false;
2363 }
2364 
2365 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2366  switch (Imm.getBitWidth()) {
2367  case 32:
2368  return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2369  ST.hasInv2PiInlineImm());
2370  case 64:
2371  return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2372  ST.hasInv2PiInlineImm());
2373  case 16:
2374  return ST.has16BitInsts() &&
2375  AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2376  ST.hasInv2PiInlineImm());
2377  default:
2378  llvm_unreachable("invalid bitwidth");
2379  }
2380 }
2381 
2382 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2383  uint8_t OperandType) const {
2384  if (!MO.isImm() ||
2385  OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2386  OperandType > AMDGPU::OPERAND_SRC_LAST)
2387  return false;
2388 
2389  // MachineOperand provides no way to tell the true operand size, since it only
2390  // records a 64-bit value. We need to know the size to determine if a 32-bit
2391  // floating point immediate bit pattern is legal for an integer immediate. It
2392  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2393 
2394  int64_t Imm = MO.getImm();
2395  switch (OperandType) {
2400  int32_t Trunc = static_cast<int32_t>(Imm);
2401  return Trunc == Imm &&
2403  }
2409  ST.hasInv2PiInlineImm());
2414  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2415  // A few special case instructions have 16-bit operands on subtargets
2416  // where 16-bit instructions are not legal.
2417  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2418  // constants in these cases
2419  int16_t Trunc = static_cast<int16_t>(Imm);
2420  return ST.has16BitInsts() &&
2422  }
2423 
2424  return false;
2425  }
2428  if (isUInt<16>(Imm)) {
2429  int16_t Trunc = static_cast<int16_t>(Imm);
2430  return ST.has16BitInsts() &&
2432  }
2433  if (!(Imm & 0xffff)) {
2434  return ST.has16BitInsts() &&
2436  }
2437  uint32_t Trunc = static_cast<uint32_t>(Imm);
2439  }
2440  default:
2441  llvm_unreachable("invalid bitwidth");
2442  }
2443 }
2444 
2445 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2446  const MCOperandInfo &OpInfo) const {
2447  switch (MO.getType()) {
2449  return false;
2451  return !isInlineConstant(MO, OpInfo);
2457  return true;
2458  default:
2459  llvm_unreachable("unexpected operand type");
2460  }
2461 }
2462 
2463 static bool compareMachineOp(const MachineOperand &Op0,
2464  const MachineOperand &Op1) {
2465  if (Op0.getType() != Op1.getType())
2466  return false;
2467 
2468  switch (Op0.getType()) {
2469  case MachineOperand::MO_Register:
2470  return Op0.getReg() == Op1.getReg();
2471  case MachineOperand::MO_Immediate:
2472  return Op0.getImm() == Op1.getImm();
2473  default:
2474  llvm_unreachable("Didn't expect to be comparing these operand types");
2475  }
2476 }
2477 
2478 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2479  const MachineOperand &MO) const {
2480  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2481 
2482  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2483 
2484  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2485  return true;
2486 
2487  if (OpInfo.RegClass < 0)
2488  return false;
2489 
2490  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2491  return RI.opCanUseInlineConstant(OpInfo.OperandType);
2492 
2493  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2494 }
2495 
2496 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2497  int Op32 = AMDGPU::getVOPe32(Opcode);
2498  if (Op32 == -1)
2499  return false;
2500 
2501  return pseudoToMCOpcode(Op32) != -1;
2502 }
2503 
2504 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2505  // The src0_modifier operand is present on all instructions
2506  // that have modifiers.
2507 
2508  return AMDGPU::getNamedOperandIdx(Opcode,
2509  AMDGPU::OpName::src0_modifiers) != -1;
2510 }
2511 
2512 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2513  unsigned OpName) const {
2514  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2515  return Mods && Mods->getImm();
2516 }
2517 
2518 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2519  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2520  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2521  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2522  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2523  hasModifiersSet(MI, AMDGPU::OpName::omod);
2524 }
2525 
2526 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2527  const MachineOperand &MO,
2528  const MCOperandInfo &OpInfo) const {
2529  // Literal constants use the constant bus.
2530  //if (isLiteralConstantLike(MO, OpInfo))
2531  // return true;
2532  if (MO.isImm())
2533  return !isInlineConstant(MO, OpInfo);
2534 
2535  if (!MO.isReg())
2536  return true; // Misc other operands like FrameIndex
2537 
2538  if (!MO.isUse())
2539  return false;
2540 
2541  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2542  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2543 
2544  // FLAT_SCR is just an SGPR pair.
2545  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2546  return true;
2547 
2548  // EXEC register uses the constant bus.
2549  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2550  return true;
2551 
2552  // SGPRs use the constant bus
2553  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2554  (!MO.isImplicit() &&
2555  (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2556  AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2557 }
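// For example (illustrative): in "v_add_f32_e32 v0, s2, v1" the SGPR s2
// occupies the single constant bus slot; a non-inline literal such as
// 0x40400000 would occupy it as well, while inline constants (e.g. 1.0)
// and VGPR operands do not use the constant bus.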
2558 
2559 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2560  for (const MachineOperand &MO : MI.implicit_operands()) {
2561  // We only care about reads.
2562  if (MO.isDef())
2563  continue;
2564 
2565  switch (MO.getReg()) {
2566  case AMDGPU::VCC:
2567  case AMDGPU::M0:
2568  case AMDGPU::FLAT_SCR:
2569  return MO.getReg();
2570 
2571  default:
2572  break;
2573  }
2574  }
2575 
2576  return AMDGPU::NoRegister;
2577 }
2578 
2579 static bool shouldReadExec(const MachineInstr &MI) {
2580  if (SIInstrInfo::isVALU(MI)) {
2581  switch (MI.getOpcode()) {
2582  case AMDGPU::V_READLANE_B32:
2583  case AMDGPU::V_READLANE_B32_si:
2584  case AMDGPU::V_READLANE_B32_vi:
2585  case AMDGPU::V_WRITELANE_B32:
2586  case AMDGPU::V_WRITELANE_B32_si:
2587  case AMDGPU::V_WRITELANE_B32_vi:
2588  return false;
2589  }
2590 
2591  return true;
2592  }
2593 
2594  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2595  SIInstrInfo::isSALU(MI) ||
2596  SIInstrInfo::isSMRD(MI))
2597  return false;
2598 
2599  return true;
2600 }
2601 
2602 static bool isSubRegOf(const SIRegisterInfo &TRI,
2603  const MachineOperand &SuperVec,
2604  const MachineOperand &SubReg) {
2605  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2606  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2607 
2608  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2609  SubReg.getReg() == SuperVec.getReg();
2610 }
2611 
2612 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2613  StringRef &ErrInfo) const {
2614  uint16_t Opcode = MI.getOpcode();
2615  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2616  return true;
2617 
2618  const MachineFunction *MF = MI.getParent()->getParent();
2619  const MachineRegisterInfo &MRI = MF->getRegInfo();
2620 
2621  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2622  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2623  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2624 
2625  // Make sure the number of operands is correct.
2626  const MCInstrDesc &Desc = get(Opcode);
2627  if (!Desc.isVariadic() &&
2628  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2629  ErrInfo = "Instruction has wrong number of operands.";
2630  return false;
2631  }
2632 
2633  if (MI.isInlineAsm()) {
2634  // Verify register classes for inlineasm constraints.
2635  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2636  I != E; ++I) {
2637  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2638  if (!RC)
2639  continue;
2640 
2641  const MachineOperand &Op = MI.getOperand(I);
2642  if (!Op.isReg())
2643  continue;
2644 
2645  unsigned Reg = Op.getReg();
2646  if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2647  ErrInfo = "inlineasm operand has incorrect register class.";
2648  return false;
2649  }
2650  }
2651 
2652  return true;
2653  }
2654 
2655  // Make sure the register classes are correct.
2656  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2657  if (MI.getOperand(i).isFPImm()) {
2658  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2659  "all fp values to integers.";
2660  return false;
2661  }
2662 
2663  int RegClass = Desc.OpInfo[i].RegClass;
2664 
2665  switch (Desc.OpInfo[i].OperandType) {
2667  if (MI.getOperand(i).isImm()) {
2668  ErrInfo = "Illegal immediate value for operand.";
2669  return false;
2670  }
2671  break;
2674  break;
2681  const MachineOperand &MO = MI.getOperand(i);
2682  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2683  ErrInfo = "Illegal immediate value for operand.";
2684  return false;
2685  }
2686  break;
2687  }
2690  // Check if this operand is an immediate.
2691  // FrameIndex operands will be replaced by immediates, so they are
2692  // allowed.
2693  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2694  ErrInfo = "Expected immediate, but got non-immediate";
2695  return false;
2696  }
2697  LLVM_FALLTHROUGH;
2698  default:
2699  continue;
2700  }
2701 
2702  if (!MI.getOperand(i).isReg())
2703  continue;
2704 
2705  if (RegClass != -1) {
2706  unsigned Reg = MI.getOperand(i).getReg();
2707  if (Reg == AMDGPU::NoRegister ||
2708  TargetRegisterInfo::isVirtualRegister(Reg))
2709  continue;
2710 
2711  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2712  if (!RC->contains(Reg)) {
2713  ErrInfo = "Operand has incorrect register class.";
2714  return false;
2715  }
2716  }
2717  }
2718 
2719  // Verify SDWA
2720  if (isSDWA(MI)) {
2721  if (!ST.hasSDWA()) {
2722  ErrInfo = "SDWA is not supported on this target";
2723  return false;
2724  }
2725 
2726  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2727 
2728  const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2729 
2730  for (int OpIdx: OpIndicies) {
2731  if (OpIdx == -1)
2732  continue;
2733  const MachineOperand &MO = MI.getOperand(OpIdx);
2734 
2735  if (!ST.hasSDWAScalar()) {
2736  // Only VGPRs on VI
2737  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2738  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2739  return false;
2740  }
2741  } else {
2742  // No immediates on GFX9
2743  if (!MO.isReg()) {
2744  ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2745  return false;
2746  }
2747  }
2748  }
2749 
2750  if (!ST.hasSDWAOmod()) {
2751  // No omod allowed on VI
2752  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2753  if (OMod != nullptr &&
2754  (!OMod->isImm() || OMod->getImm() != 0)) {
2755  ErrInfo = "OMod not allowed in SDWA instructions on VI";
2756  return false;
2757  }
2758  }
2759 
2760  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2761  if (isVOPC(BasicOpcode)) {
2762  if (!ST.hasSDWASdst() && DstIdx != -1) {
2763  // Only vcc allowed as dst on VI for VOPC
2764  const MachineOperand &Dst = MI.getOperand(DstIdx);
2765  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2766  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2767  return false;
2768  }
2769  } else if (!ST.hasSDWAOutModsVOPC()) {
2770  // No clamp allowed on GFX9 for VOPC
2771  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2772  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2773  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2774  return false;
2775  }
2776 
2777  // No omod allowed on GFX9 for VOPC
2778  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2779  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2780  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2781  return false;
2782  }
2783  }
2784  }
2785 
2786  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
2787  if (DstUnused && DstUnused->isImm() &&
2788  DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
2789  const MachineOperand &Dst = MI.getOperand(DstIdx);
2790  if (!Dst.isReg() || !Dst.isTied()) {
2791  ErrInfo = "Dst register should have tied register";
2792  return false;
2793  }
2794 
2795  const MachineOperand &TiedMO =
2796  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
2797  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
2798  ErrInfo =
2799  "Dst register should be tied to implicit use of preserved register";
2800  return false;
2801  } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
2802  Dst.getReg() != TiedMO.getReg()) {
2803  ErrInfo = "Dst register should use same physical register as preserved";
2804  return false;
2805  }
2806  }
2807  }
2808 
2809  // Verify VOP*. Ignore multiple sgpr operands on writelane.
2810  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
2811  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
2812  // Only look at the true operands. Only a real operand can use the constant
2813  // bus, and we don't want to check pseudo-operands like the source modifier
2814  // flags.
2815  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2816 
2817  unsigned ConstantBusCount = 0;
2818  unsigned LiteralCount = 0;
2819 
2820  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2821  ++ConstantBusCount;
2822 
2823  unsigned SGPRUsed = findImplicitSGPRRead(MI);
2824  if (SGPRUsed != AMDGPU::NoRegister)
2825  ++ConstantBusCount;
2826 
2827  for (int OpIdx : OpIndices) {
2828  if (OpIdx == -1)
2829  break;
2830  const MachineOperand &MO = MI.getOperand(OpIdx);
2831  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
2832  if (MO.isReg()) {
2833  if (MO.getReg() != SGPRUsed)
2834  ++ConstantBusCount;
2835  SGPRUsed = MO.getReg();
2836  } else {
2837  ++ConstantBusCount;
2838  ++LiteralCount;
2839  }
2840  }
2841  }
2842  if (ConstantBusCount > 1) {
2843  ErrInfo = "VOP* instruction uses the constant bus more than once";
2844  return false;
2845  }
2846 
2847  if (isVOP3(MI) && LiteralCount) {
2848  ErrInfo = "VOP3 instruction uses literal";
2849  return false;
2850  }
2851  }
2852 
2853  // Verify misc. restrictions on specific instructions.
2854  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2855  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
2856  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2857  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2858  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2859  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
2860  if (!compareMachineOp(Src0, Src1) &&
2861  !compareMachineOp(Src0, Src2)) {
2862  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
2863  return false;
2864  }
2865  }
2866  }
2867 
2868  if (isSOPK(MI)) {
2869  int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
2870  if (sopkIsZext(MI)) {
2871  if (!isUInt<16>(Imm)) {
2872  ErrInfo = "invalid immediate for SOPK instruction";
2873  return false;
2874  }
2875  } else {
2876  if (!isInt<16>(Imm)) {
2877  ErrInfo = "invalid immediate for SOPK instruction";
2878  return false;
2879  }
2880  }
2881  }
2882 
2883  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
2884  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
2885  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2886  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
2887  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2888  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
2889 
2890  const unsigned StaticNumOps = Desc.getNumOperands() +
2891  Desc.getNumImplicitUses();
2892  const unsigned NumImplicitOps = IsDst ? 2 : 1;
2893 
2894  // Allow additional implicit operands. This allows a fixup done by the post
2895  // RA scheduler where the main implicit operand is killed and implicit-defs
2896  // are added for sub-registers that remain live after this instruction.
2897  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
2898  ErrInfo = "missing implicit register operands";
2899  return false;
2900  }
2901 
2902  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2903  if (IsDst) {
2904  if (!Dst->isUse()) {
2905  ErrInfo = "v_movreld_b32 vdst should be a use operand";
2906  return false;
2907  }
2908 
2909  unsigned UseOpIdx;
2910  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
2911  UseOpIdx != StaticNumOps + 1) {
2912  ErrInfo = "movrel implicit operands should be tied";
2913  return false;
2914  }
2915  }
2916 
2917  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2918  const MachineOperand &ImpUse
2919  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
2920  if (!ImpUse.isReg() || !ImpUse.isUse() ||
2921  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
2922  ErrInfo = "src0 should be subreg of implicit vector use";
2923  return false;
2924  }
2925  }
2926 
2927  // Make sure we aren't losing exec uses in the td files. This mostly requires
2928  // being careful when using let Uses to try to add other use registers.
2929  if (shouldReadExec(MI)) {
2930  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
2931  ErrInfo = "VALU instruction does not implicitly read exec mask";
2932  return false;
2933  }
2934  }
2935 
2936  if (isSMRD(MI)) {
2937  if (MI.mayStore()) {
2938  // The register offset form of scalar stores may only use m0 as the
2939  // soffset register.
2940  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
2941  if (Soff && Soff->getReg() != AMDGPU::M0) {
2942  ErrInfo = "scalar stores must use m0 as offset register";
2943  return false;
2944  }
2945  }
2946  }
2947 
2948  if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
2949  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2950  if (Offset->getImm() != 0) {
2951  ErrInfo = "subtarget does not support offsets in flat instructions";
2952  return false;
2953  }
2954  }
2955 
2956  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
2957  if (DppCt) {
2958  using namespace AMDGPU::DPP;
2959 
2960  unsigned DC = DppCt->getImm();
2961  if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
2962  DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
2967  ErrInfo = "Invalid dpp_ctrl value";
2968  return false;
2969  }
2970  }
2971 
2972  return true;
2973 }
2974 
2975 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
2976  switch (MI.getOpcode()) {
2977  default: return AMDGPU::INSTRUCTION_LIST_END;
2978  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
2979  case AMDGPU::COPY: return AMDGPU::COPY;
2980  case AMDGPU::PHI: return AMDGPU::PHI;
2981  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
2982  case AMDGPU::WQM: return AMDGPU::WQM;
2983  case AMDGPU::WWM: return AMDGPU::WWM;
2984  case AMDGPU::S_MOV_B32:
2985  return MI.getOperand(1).isReg() ?
2986  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
2987  case AMDGPU::S_ADD_I32:
2988  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
2989  case AMDGPU::S_ADDC_U32:
2990  return AMDGPU::V_ADDC_U32_e32;
2991  case AMDGPU::S_SUB_I32:
2992  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
2993  // FIXME: These are not consistently handled, and selected when the carry is
2994  // used.
2995  case AMDGPU::S_ADD_U32:
2996  return AMDGPU::V_ADD_I32_e32;
2997  case AMDGPU::S_SUB_U32:
2998  return AMDGPU::V_SUB_I32_e32;
2999  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3000  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
3001  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3002  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3003  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3004  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3005  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3006  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3007  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3008  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3009  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3010  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3011  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3012  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3013  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3014  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3015  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3016  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3017  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3018  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3019  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3020  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3021  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3022  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3023  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3024  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3025  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3026  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3027  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3028  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3029  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3030  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3031  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3032  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3033  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3034  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3035  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3036  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3037  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3038  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3039  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3040  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3041  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3042  }
3043 }
3044 
3045 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
3046  unsigned OpNo) const {
3047  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3048  const MCInstrDesc &Desc = get(MI.getOpcode());
3049  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
3050  Desc.OpInfo[OpNo].RegClass == -1) {
3051  unsigned Reg = MI.getOperand(OpNo).getReg();
3052 
3053  if (TargetRegisterInfo::isVirtualRegister(Reg))
3054  return MRI.getRegClass(Reg);
3055  return RI.getPhysRegClass(Reg);
3056  }
3057 
3058  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3059  return RI.getRegClass(RCID);
3060 }
3061 
3062 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
3063  switch (MI.getOpcode()) {
3064  case AMDGPU::COPY:
3065  case AMDGPU::REG_SEQUENCE:
3066  case AMDGPU::PHI:
3067  case AMDGPU::INSERT_SUBREG:
3068  return RI.hasVGPRs(getOpRegClass(MI, 0));
3069  default:
3070  return RI.hasVGPRs(getOpRegClass(MI, OpNo));
3071  }
3072 }
3073 
3074 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3075  MachineBasicBlock::iterator I = MI;
3076  MachineBasicBlock *MBB = MI.getParent();
3077  MachineOperand &MO = MI.getOperand(OpIdx);
3078  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3079  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3080  const TargetRegisterClass *RC = RI.getRegClass(RCID);
3081  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3082  if (MO.isReg())
3083  Opcode = AMDGPU::COPY;
3084  else if (RI.isSGPRClass(RC))
3085  Opcode = AMDGPU::S_MOV_B32;
3086 
3087  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3088  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3089  VRC = &AMDGPU::VReg_64RegClass;
3090  else
3091  VRC = &AMDGPU::VGPR_32RegClass;
3092 
3093  unsigned Reg = MRI.createVirtualRegister(VRC);
3094  DebugLoc DL = MBB->findDebugLoc(I);
3095  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3096  MO.ChangeToRegister(Reg, false);
3097 }
3098 
3099 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3100  MachineRegisterInfo &MRI,
3101  MachineOperand &SuperReg,
3102  const TargetRegisterClass *SuperRC,
3103  unsigned SubIdx,
3104  const TargetRegisterClass *SubRC)
3105  const {
3106  MachineBasicBlock *MBB = MI->getParent();
3107  DebugLoc DL = MI->getDebugLoc();
3108  unsigned SubReg = MRI.createVirtualRegister(SubRC);
3109 
3110  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3111  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3112  .addReg(SuperReg.getReg(), 0, SubIdx);
3113  return SubReg;
3114  }
3115 
3116  // Just in case the super register is itself a sub-register, copy it to a new
3117  // value so we don't need to worry about merging its subreg index with the
3118  // SubIdx passed to this function. The register coalescer should be able to
3119  // eliminate this extra copy.
3120  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3121 
3122  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3123  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3124 
3125  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3126  .addReg(NewSuperReg, 0, SubIdx);
3127 
3128  return SubReg;
3129 }
3130 
3131 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3132  MachineBasicBlock::iterator MII,
3133  MachineRegisterInfo &MRI,
3134  MachineOperand &Op,
3135  const TargetRegisterClass *SuperRC,
3136  unsigned SubIdx,
3137  const TargetRegisterClass *SubRC) const {
3138  if (Op.isImm()) {
3139  if (SubIdx == AMDGPU::sub0)
3140  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3141  if (SubIdx == AMDGPU::sub1)
3142  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3143 
3144  llvm_unreachable("Unhandled register index for immediate");
3145  }
3146 
3147  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3148  SubIdx, SubRC);
3149  return MachineOperand::CreateReg(SubReg, false);
3150 }
3151 
3152 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3153 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3154  assert(Inst.getNumExplicitOperands() == 3);
3155  MachineOperand Op1 = Inst.getOperand(1);
3156  Inst.RemoveOperand(1);
3157  Inst.addOperand(Op1);
3158 }
3159 
3160 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3161  const MCOperandInfo &OpInfo,
3162  const MachineOperand &MO) const {
3163  if (!MO.isReg())
3164  return false;
3165 
3166  unsigned Reg = MO.getReg();
3167  const TargetRegisterClass *RC =
3168  TargetRegisterInfo::isVirtualRegister(Reg) ?
3169  MRI.getRegClass(Reg) :
3170  RI.getPhysRegClass(Reg);
3171 
3172  const SIRegisterInfo *TRI =
3173  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3174  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3175 
3176  // In order to be legal, the common sub-class must be equal to the
3177  // class of the current operand. For example:
3178  //
3179  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3180  // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3181  //
3182  // s_sendmsg 0, s0 ; Operand defined as m0reg
3183  // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3184 
3185  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3186 }
3187 
3188 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3189  const MCOperandInfo &OpInfo,
3190  const MachineOperand &MO) const {
3191  if (MO.isReg())
3192  return isLegalRegOperand(MRI, OpInfo, MO);
3193 
3194  // Handle non-register types that are treated like immediates.
3195  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3196  return true;
3197 }
3198 
3199 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3200  const MachineOperand *MO) const {
3201  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3202  const MCInstrDesc &InstDesc = MI.getDesc();
3203  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3204  const TargetRegisterClass *DefinedRC =
3205  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3206  if (!MO)
3207  MO = &MI.getOperand(OpIdx);
3208 
3209  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3210 
3211  RegSubRegPair SGPRUsed;
3212  if (MO->isReg())
3213  SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3214 
3215  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3216  if (i == OpIdx)
3217  continue;
3218  const MachineOperand &Op = MI.getOperand(i);
3219  if (Op.isReg()) {
3220  if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3221  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3222  return false;
3223  }
3224  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3225  return false;
3226  }
3227  }
3228  }
3229 
3230  if (MO->isReg()) {
3231  assert(DefinedRC);
3232  return isLegalRegOperand(MRI, OpInfo, *MO);
3233  }
3234 
3235  // Handle non-register types that are treated like immediates.
3236  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3237 
3238  if (!DefinedRC) {
3239  // This operand expects an immediate.
3240  return true;
3241  }
3242 
3243  return isImmOperandLegal(MI, OpIdx, *MO);
3244 }
3245 
3246 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3247  MachineInstr &MI) const {
3248  unsigned Opc = MI.getOpcode();
3249  const MCInstrDesc &InstrDesc = get(Opc);
3250 
3251  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3252  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3253 
3254  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3255  // we need to only have one constant bus use.
3256  //
3257  // Note we do not need to worry about literal constants here. They are
3258  // disabled for the operand type for instructions because they will always
3259  // violate the one constant bus use rule.
3260  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3261  if (HasImplicitSGPR) {
3262  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3263  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3264 
3265  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3266  legalizeOpWithMove(MI, Src0Idx);
3267  }
3268 
3269  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3270  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3271  // src0/src1 with V_READFIRSTLANE.
3272  if (Opc == AMDGPU::V_WRITELANE_B32) {
3273  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3274  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3275  const DebugLoc &DL = MI.getDebugLoc();
3276  if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3277  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3278  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3279  .add(Src0);
3280  Src0.ChangeToRegister(Reg, false);
3281  }
3282  if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3283  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3284  const DebugLoc &DL = MI.getDebugLoc();
3285  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3286  .add(Src1);
3287  Src1.ChangeToRegister(Reg, false);
3288  }
3289  return;
3290  }
3291 
3292  // VOP2 src0 instructions support all operand types, so we don't need to check
3293  // their legality. If src1 is already legal, we don't need to do anything.
3294  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3295  return;
3296 
3297  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3298  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3299  // select is uniform.
3300  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3301  RI.isVGPR(MRI, Src1.getReg())) {
3302  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3303  const DebugLoc &DL = MI.getDebugLoc();
3304  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3305  .add(Src1);
3306  Src1.ChangeToRegister(Reg, false);
3307  return;
3308  }
3309 
3310  // We do not use commuteInstruction here because it is too aggressive and will
3311  // commute if it is possible. We only want to commute here if it improves
3312  // legality. This can be called a fairly large number of times so don't waste
3313  // compile time pointlessly swapping and checking legality again.
3314  if (HasImplicitSGPR || !MI.isCommutable()) {
3315  legalizeOpWithMove(MI, Src1Idx);
3316  return;
3317  }
3318 
3319  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3320  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3321 
3322  // If src0 can be used as src1, commuting will make the operands legal.
3323  // Otherwise we have to give up and insert a move.
3324  //
3325  // TODO: Other immediate-like operand kinds could be commuted if there was a
3326  // MachineOperand::ChangeTo* for them.
3327  if ((!Src1.isImm() && !Src1.isReg()) ||
3328  !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3329  legalizeOpWithMove(MI, Src1Idx);
3330  return;
3331  }
3332 
3333  int CommutedOpc = commuteOpcode(MI);
3334  if (CommutedOpc == -1) {
3335  legalizeOpWithMove(MI, Src1Idx);
3336  return;
3337  }
3338 
3339  MI.setDesc(get(CommutedOpc));
3340 
3341  unsigned Src0Reg = Src0.getReg();
3342  unsigned Src0SubReg = Src0.getSubReg();
3343  bool Src0Kill = Src0.isKill();
3344 
3345  if (Src1.isImm())
3346  Src0.ChangeToImmediate(Src1.getImm());
3347  else if (Src1.isReg()) {
3348  Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3349  Src0.setSubReg(Src1.getSubReg());
3350  } else
3351  llvm_unreachable("Should only have register or immediate operands");
3352 
3353  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3354  Src1.setSubReg(Src0SubReg);
3355 }
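// Illustrative legalization (register names assumed): "V_ADD_F32_e32 %d, %v,
// %s" has an SGPR in src1, which VOP2 does not allow. Because src0 holds a
// VGPR, the operands are simply commuted to "V_ADD_F32_e32 %d, %s, %v"; if
// commuting were not possible, src1 would be rewritten through
// legalizeOpWithMove instead.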
3356 
3357 // Legalize VOP3 operands. Because all operand types are supported for any
3358 // operand, and since literal constants are not allowed and should never be
3359 // seen, we only need to worry about inserting copies if we use multiple SGPR
3360 // operands.
3361 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3362  MachineInstr &MI) const {
3363  unsigned Opc = MI.getOpcode();
3364 
3365  int VOP3Idx[3] = {
3366  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3367  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3368  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3369  };
3370 
3371  // Find the one SGPR operand we are allowed to use.
3372  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3373 
3374  for (unsigned i = 0; i < 3; ++i) {
3375  int Idx = VOP3Idx[i];
3376  if (Idx == -1)
3377  break;
3378  MachineOperand &MO = MI.getOperand(Idx);
3379 
3380  // We should never see a VOP3 instruction with an illegal immediate operand.
3381  if (!MO.isReg())
3382  continue;
3383 
3384  if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3385  continue; // VGPRs are legal
3386 
3387  if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3388  SGPRReg = MO.getReg();
3389  // We can use one SGPR in each VOP3 instruction.
3390  continue;
3391  }
3392 
3393  // If we make it this far, then the operand is not legal and we must
3394  // legalize it.
3395  legalizeOpWithMove(MI, Idx);
3396  }
3397 }
3398 
3399 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3400  MachineRegisterInfo &MRI) const {
3401  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3402  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3403  unsigned DstReg = MRI.createVirtualRegister(SRC);
3404  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3405 
3406  if (SubRegs == 1) {
3407  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3408  get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3409  .addReg(SrcReg);
3410  return DstReg;
3411  }
3412 
3413  SmallVector<unsigned, 8> SRegs;
3414  for (unsigned i = 0; i < SubRegs; ++i) {
3415  unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3416  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3417  get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3418  .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3419  SRegs.push_back(SGPR);
3420  }
3421 
3422  MachineInstrBuilder MIB =
3423  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3424  get(AMDGPU::REG_SEQUENCE), DstReg);
3425  for (unsigned i = 0; i < SubRegs; ++i) {
3426  MIB.addReg(SRegs[i]);
3427  MIB.addImm(RI.getSubRegFromChannel(i));
3428  }
3429  return DstReg;
3430 }
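// Illustrative expansion (assuming a 64-bit pointer held in a VGPR pair):
//   %s0    = V_READFIRSTLANE_B32 %v.sub0
//   %s1    = V_READFIRSTLANE_B32 %v.sub1
//   %sbase = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1
// so the uniform value can feed an instruction that requires SGPRs.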
3431 
3432 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3433  MachineInstr &MI) const {
3434 
3435  // If the pointer is stored in VGPRs, then we need to move it to
3436  // SGPRs using v_readfirstlane. This is safe because we only select
3437  // loads with uniform pointers to SMRD instructions, so we know the
3438  // pointer value is uniform.
3439  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3440  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3441  unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3442  SBase->setReg(SGPR);
3443  }
3444 }
3445 
3446 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3447  MachineBasicBlock::iterator I,
3448  const TargetRegisterClass *DstRC,
3449  MachineOperand &Op,
3450  MachineRegisterInfo &MRI,
3451  const DebugLoc &DL) const {
3452  unsigned OpReg = Op.getReg();
3453  unsigned OpSubReg = Op.getSubReg();
3454 
3455  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3456  RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3457 
3458  // Check if operand is already the correct register class.
3459  if (DstRC == OpRC)
3460  return;
3461 
3462  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3463  MachineInstr *Copy =
3464  BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3465 
3466  Op.setReg(DstReg);
3467  Op.setSubReg(0);
3468 
3469  MachineInstr *Def = MRI.getVRegDef(OpReg);
3470  if (!Def)
3471  return;
3472 
3473  // Try to eliminate the copy if it is copying an immediate value.
3474  if (Def->isMoveImmediate())
3475  FoldImmediate(*Copy, *Def, OpReg, &MRI);
3476 }
3477 
3478 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
3479  MachineFunction &MF = *MI.getParent()->getParent();
3480  MachineRegisterInfo &MRI = MF.getRegInfo();
3481 
3482  // Legalize VOP2
3483  if (isVOP2(MI) || isVOPC(MI)) {
3484  legalizeOperandsVOP2(MRI, MI);
3485  return;
3486  }
3487 
3488  // Legalize VOP3
3489  if (isVOP3(MI)) {
3490  legalizeOperandsVOP3(MRI, MI);
3491  return;
3492  }
3493 
3494  // Legalize SMRD
3495  if (isSMRD(MI)) {
3496  legalizeOperandsSMRD(MRI, MI);
3497  return;
3498  }
3499 
3500  // Legalize REG_SEQUENCE and PHI
3501  // The register class of the operands must be the same type as the register
3502  // class of the output.
3503  if (MI.getOpcode() == AMDGPU::PHI) {
3504  const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3505  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3506  if (!MI.getOperand(i).isReg() ||
3507  !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3508  continue;
3509  const TargetRegisterClass *OpRC =
3510  MRI.getRegClass(MI.getOperand(i).getReg());
3511  if (RI.hasVGPRs(OpRC)) {
3512  VRC = OpRC;
3513  } else {
3514  SRC = OpRC;
3515  }
3516  }
3517 
3518  // If any of the operands are VGPR registers, then they must all be
3519  // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
3520  // legalizing them.
3521  if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3522  if (!VRC) {
3523  assert(SRC);
3524  VRC = RI.getEquivalentVGPRClass(SRC);
3525  }
3526  RC = VRC;
3527  } else {
3528  RC = SRC;
3529  }
3530 
3531  // Update all the operands so they have the same type.
3532  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3533  MachineOperand &Op = MI.getOperand(I);
3534  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3535  continue;
3536 
3537  // MI is a PHI instruction.
3538  MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3539  MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3540 
3541  // Avoid creating no-op copies with the same src and dst reg class. These
3542  // confuse some of the machine passes.
3543  legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3544  }
3545  }
3546 
3547  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3548  // VGPR dest type and SGPR sources, insert copies so all operands are
3549  // VGPRs. This seems to help operand folding / the register coalescer.
3550  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3551  MachineBasicBlock *MBB = MI.getParent();
3552  const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3553  if (RI.hasVGPRs(DstRC)) {
3554  // Update all the operands so they are VGPR register classes. These may
3555  // not be the same register class because REG_SEQUENCE supports mixing
3556  // subregister index types e.g. sub0_sub1 + sub2 + sub3
3557  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3558  MachineOperand &Op = MI.getOperand(I);
3559  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3560  continue;
3561 
3562  const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3563  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3564  if (VRC == OpRC)
3565  continue;
3566 
3567  legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3568  Op.setIsKill();
3569  }
3570  }
3571 
3572  return;
3573  }
3574 
3575  // Legalize INSERT_SUBREG
3576  // src0 must have the same register class as dst
3577  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3578  unsigned Dst = MI.getOperand(0).getReg();
3579  unsigned Src0 = MI.getOperand(1).getReg();
3580  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3581  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3582  if (DstRC != Src0RC) {
3583  MachineBasicBlock *MBB = MI.getParent();
3584  MachineOperand &Op = MI.getOperand(1);
3585  legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3586  }
3587  return;
3588  }
3589 
3590  // Legalize SI_INIT_M0
3591  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
3592  MachineOperand &Src = MI.getOperand(0);
3593  if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
3594  Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
3595  return;
3596  }
3597 
3598  // Legalize MIMG and MUBUF/MTBUF for shaders.
3599  //
3600  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3601  // scratch memory access. In both cases, the legalization never involves
3602  // conversion to the addr64 form.
3603  if (isMIMG(MI) ||
3604  (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
3605  (isMUBUF(MI) || isMTBUF(MI)))) {
3606  MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3607  if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
3608  unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3609  SRsrc->setReg(SGPR);
3610  }
3611 
3612  MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3613  if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
3614  unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3615  SSamp->setReg(SGPR);
3616  }
3617  return;
3618  }
3619 
3620  // Legalize MUBUF* instructions by converting to addr64 form.
3621  // FIXME: If we start using the non-addr64 instructions for compute, we
3622  // may need to legalize them as above. This especially applies to the
3623  // buffer_load_format_* variants and variants with idxen (or bothen).
3624  int SRsrcIdx =
3625  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3626  if (SRsrcIdx != -1) {
3627  // We have an MUBUF instruction
3628  MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
3629  unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
3630  if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
3631  RI.getRegClass(SRsrcRC))) {
3632  // The operands are legal.
3633  // FIXME: We may need to legalize operands besides srsrc.
3634  return;
3635  }
3636 
3637  MachineBasicBlock &MBB = *MI.getParent();
3638 
3639  // Extract the ptr from the resource descriptor.
3640  unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
3641  &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3642 
3643  // Create an empty resource descriptor
3644  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3645  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3646  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3647  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3648  uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
3649 
3650  // Zero64 = 0
3651  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
3652  .addImm(0);
3653 
3654  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3655  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3656  .addImm(RsrcDataFormat & 0xFFFFFFFF);
3657 
3658  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3659  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3660  .addImm(RsrcDataFormat >> 32);
3661 
3662  // NewSRsrc = {Zero64, SRsrcFormat}
3663  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3664  .addReg(Zero64)
3665  .addImm(AMDGPU::sub0_sub1)
3666  .addReg(SRsrcFormatLo)
3667  .addImm(AMDGPU::sub2)
3668  .addReg(SRsrcFormatHi)
3669  .addImm(AMDGPU::sub3);
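  // The rewritten descriptor is now { base = 0 (sub0_sub1),
  // RSRC_DATA_FORMAT[31:0] (sub2), RSRC_DATA_FORMAT[63:32] (sub3) }; the
  // original base pointer (SRsrcPtr) is instead folded into the 64-bit vaddr
  // built below, as the ADDR64 form expects.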
3670 
3671  MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3672  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3673  if (VAddr) {
3674  // This is already an ADDR64 instruction so we need to add the pointer
3675  // extracted from the resource descriptor to the current value of VAddr.
3676  unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3677  unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3678 
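  // The 64-bit pointer add below is split into a low 32-bit add, whose
  // carry-out goes to VCC implicitly (the _e32 form), and an add-with-carry
  // for the high half that consumes that VCC value.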
3679  // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
3680  DebugLoc DL = MI.getDebugLoc();
3681  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3682  .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3683  .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
3684 
3685  // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
3686  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3687  .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3688  .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
3689 
3690  // NewVaddr = {NewVaddrHi, NewVaddrLo}
3691  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
3692  .addReg(NewVAddrLo)
3693  .addImm(AMDGPU::sub0)
3694  .addReg(NewVAddrHi)
3695  .addImm(AMDGPU::sub1);
3696  } else {
3697  // This instruction is the _OFFSET variant, so we need to convert it to
3698  // ADDR64.
3699  assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
3700  < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
3701  "FIXME: Need to emit flat atomics here");
3702 
3703  MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
3704  MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3705  MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
3706  unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
3707 
3708  // Atomics with return have an additional tied operand and are
3709  // missing some of the special bits.
3710  MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
3711  MachineInstr *Addr64;
3712 
3713  if (!VDataIn) {
3714  // Regular buffer load / store.
3715  MachineInstrBuilder MIB =
3716  BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3717  .add(*VData)
3718  .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3719  // This will be replaced later
3720  // with the new value of vaddr.
3721  .add(*SRsrc)
3722  .add(*SOffset)
3723  .add(*Offset);
3724 
3725  // Atomics do not have this operand.
3726  if (const MachineOperand *GLC =
3727  getNamedOperand(MI, AMDGPU::OpName::glc)) {
3728  MIB.addImm(GLC->getImm());
3729  }
3730 
3731  MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
3732 
3733  if (const MachineOperand *TFE =
3734  getNamedOperand(MI, AMDGPU::OpName::tfe)) {
3735  MIB.addImm(TFE->getImm());
3736  }
3737 
3739  Addr64 = MIB;
3740  } else {
3741  // Atomics with return.
3742  Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3743  .add(*VData)
3744  .add(*VDataIn)
3745  .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3746  // This will be replaced later
3747  // with the new value of vaddr.
3748  .add(*SRsrc)
3749  .add(*SOffset)
3750  .add(*Offset)
3751  .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
3753  }
3754 
3755  MI.removeFromParent();
3756 
3757  // NewVaddr = {NewVaddrHi, NewVaddrLo}
3758  BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
3759  NewVAddr)
3760  .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3761  .addImm(AMDGPU::sub0)
3762  .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3763  .addImm(AMDGPU::sub1);
3764 
3765  VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
3766  SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
3767  }
3768 
3769  // Update the instruction to use NewVaddr
3770  VAddr->setReg(NewVAddr);
3771  // Update the instruction to use NewSRsrc
3772  SRsrc->setReg(NewSRsrc);
3773  }
3774 }
3775 
3776 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
3777  SetVectorType Worklist;
3778  Worklist.insert(&TopInst);
3779 
3780  while (!Worklist.empty()) {
3781  MachineInstr &Inst = *Worklist.pop_back_val();
3782  MachineBasicBlock *MBB = Inst.getParent();
3783  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3784 
3785  unsigned Opcode = Inst.getOpcode();
3786  unsigned NewOpcode = getVALUOp(Inst);
3787 
3788  // Handle some special cases
3789  switch (Opcode) {
3790  default:
3791  break;
3792  case AMDGPU::S_ADD_U64_PSEUDO:
3793  case AMDGPU::S_SUB_U64_PSEUDO:
3794  splitScalar64BitAddSub(Worklist, Inst);
3795  Inst.eraseFromParent();
3796  continue;
3797  case AMDGPU::S_ADD_I32:
3798  case AMDGPU::S_SUB_I32:
3799  // FIXME: The u32 versions currently selected use the carry.
3800  if (moveScalarAddSub(Worklist, Inst))
3801  continue;
3802 
3803  // Default handling
3804  break;
3805  case AMDGPU::S_AND_B64:
3806  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
3807  Inst.eraseFromParent();
3808  continue;
3809 
3810  case AMDGPU::S_OR_B64:
3811  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
3812  Inst.eraseFromParent();
3813  continue;
3814 
3815  case AMDGPU::S_XOR_B64:
3816  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
3817  Inst.eraseFromParent();
3818  continue;
3819 
3820  case AMDGPU::S_NOT_B64:
3821  splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
3822  Inst.eraseFromParent();
3823  continue;
3824 
3825  case AMDGPU::S_BCNT1_I32_B64:
3826  splitScalar64BitBCNT(Worklist, Inst);
3827  Inst.eraseFromParent();
3828  continue;
3829 
3830  case AMDGPU::S_BFE_I64:
3831  splitScalar64BitBFE(Worklist, Inst);
3832  Inst.eraseFromParent();
3833  continue;
3834 
3835  case AMDGPU::S_LSHL_B32:
3836  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
3837  NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
3838  swapOperands(Inst);
3839  }
3840  break;
3841  case AMDGPU::S_ASHR_I32:
3842  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
3843  NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
3844  swapOperands(Inst);
3845  }
3846  break;
3847  case AMDGPU::S_LSHR_B32:
3848  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
3849  NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
3850  swapOperands(Inst);
3851  }
3852  break;
3853  case AMDGPU::S_LSHL_B64:
3854  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
3855  NewOpcode = AMDGPU::V_LSHLREV_B64;
3856  swapOperands(Inst);
3857  }
3858  break;
3859  case AMDGPU::S_ASHR_I64:
3860  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
3861  NewOpcode = AMDGPU::V_ASHRREV_I64;
3862  swapOperands(Inst);
3863  }
3864  break;
3865  case AMDGPU::S_LSHR_B64:
3866  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
3867  NewOpcode = AMDGPU::V_LSHRREV_B64;
3868  swapOperands(Inst);
3869  }
3870  break;
3871 
3872  case AMDGPU::S_ABS_I32:
3873  lowerScalarAbs(Worklist, Inst);
3874  Inst.eraseFromParent();
3875  continue;
3876 
3877  case AMDGPU::S_CBRANCH_SCC0:
3878  case AMDGPU::S_CBRANCH_SCC1:
3879  // Clear unused bits of vcc
3880  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
3881  AMDGPU::VCC)
3882  .addReg(AMDGPU::EXEC)
3883  .addReg(AMDGPU::VCC);
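  // A VALU compare writes a bit into VCC for every lane, including lanes that
  // are inactive under EXEC, so the S_AND_B64 above masks those bits off
  // before the branch (rewritten below via getVALUOp, presumably to a branch
  // on VCC) tests the condition.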
3884  break;
3885 
3886  case AMDGPU::S_BFE_U64:
3887  case AMDGPU::S_BFM_B64:
3888  llvm_unreachable("Moving this op to VALU not implemented");
3889 
3890  case AMDGPU::S_PACK_LL_B32_B16:
3891  case AMDGPU::S_PACK_LH_B32_B16:
3892  case AMDGPU::S_PACK_HH_B32_B16:
3893  movePackToVALU(Worklist, MRI, Inst);
3894  Inst.eraseFromParent();
3895  continue;
3896 
3897  case AMDGPU::S_XNOR_B32:
3898  lowerScalarXnor(Worklist, Inst);
3899  Inst.eraseFromParent();
3900  continue;
3901 
3902  case AMDGPU::S_XNOR_B64:
3903  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
3904  Inst.eraseFromParent();
3905  continue;
3906 
3907  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
3908  unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3909  const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
3910  auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
3911  unsigned Offset = 0;
3912 
3913  // FIXME: This isn't safe because the addressing mode doesn't work
3914  // correctly if vaddr is negative.
3915  //
3916  // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
3917  //
3918  // See if we can extract an immediate offset by recognizing one of these:
3919  // V_ADD_I32_e32 dst, imm, src1
3920  // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
3921  // V_ADD will be removed by "Remove dead machine instructions".
3922  if (Add &&
3923  (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
3924  Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
3925  static const unsigned SrcNames[2] = {
3926  AMDGPU::OpName::src0,
3927  AMDGPU::OpName::src1,
3928  };
3929 
3930  // Find a literal offset in one of source operands.
3931  for (int i = 0; i < 2; i++) {
3932  const MachineOperand *Src =
3933  getNamedOperand(*Add, SrcNames[i]);
3934 
3935  if (Src->isReg()) {
3936  auto Mov = MRI.getUniqueVRegDef(Src->getReg());
3937  if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
3938  Src = &Mov->getOperand(1);
3939  }
3940 
3941  if (Src) {
3942  if (Src->isImm())
3943  Offset = Src->getImm();
3944  else if (Src->isCImm())
3945  Offset = Src->getCImm()->getZExtValue();
3946  }
3947 
3948  if (Offset && isLegalMUBUFImmOffset(Offset)) {
3949  VAddr = getNamedOperand(*Add, SrcNames[!i]);
3950  break;
3951  }
3952 
3953  Offset = 0;
3954  }
3955  }
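  // For illustration (register names hypothetical): given
  //   %imm  = S_MOV_B32 64
  //   %soff = V_ADD_I32_e32 %imm, %base
  // the loop above extracts Offset = 64 and switches VAddr to %base, letting
  // the MUBUF load below encode the 64 in its immediate offset field.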
3956 
3957  MachineInstr *NewInstr =
3958  BuildMI(*MBB, Inst, Inst.getDebugLoc(),
3959  get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
3960  .add(*VAddr) // vaddr
3961  .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
3962  .addImm(0) // soffset
3963  .addImm(Offset) // offset
3964  .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
3965  .addImm(0) // slc
3966  .addImm(0) // tfe
3968  .getInstr();
3969 
3970  MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
3971  VDst);
3972  addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
3973  Inst.eraseFromParent();
3974 
3975  // Legalize all operands other than the offset. Notably, convert the srsrc
3976  // into SGPRs using v_readfirstlane if needed.
3977  legalizeOperands(*NewInstr);
3978  continue;
3979  }
3980  }
3981 
3982  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
3983  // We cannot move this instruction to the VALU, so we should try to
3984  // legalize its operands instead.
3985  legalizeOperands(Inst);
3986  continue;
3987  }
3988 
3989  // Use the new VALU Opcode.
3990  const MCInstrDesc &NewDesc = get(NewOpcode);
3991  Inst.setDesc(NewDesc);
3992 
3993  // Remove any references to SCC. Vector instructions can't read from it, and
3994  // we're just about to add the implicit use / defs of VCC, and we don't want
3995  // both.
3996  for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
3997  MachineOperand &Op = Inst.getOperand(i);
3998  if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
3999  Inst.RemoveOperand(i);
4000  addSCCDefUsersToVALUWorklist(Inst, Worklist);
4001  }
4002  }
4003 
4004  if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4005  // We are converting these to a BFE, so we need to add the missing
4006  // operands for the size and offset.
4007  unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4008  Inst.addOperand(MachineOperand::CreateImm(0));
4009  Inst.addOperand(MachineOperand::CreateImm(Size));
4010 
4011  } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4012  // The VALU version adds the second operand to the result, so insert an
4013  // extra 0 operand.
4014  Inst.addOperand(MachineOperand::CreateImm(0));
4015  }
4016 
4018 
4019  if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4020  const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4021  // If we need to move this to VGPRs, we need to unpack the second operand
4022  // back into the 2 separate ones for bit offset and width.
4023  assert(OffsetWidthOp.isImm() &&
4024  "Scalar BFE is only implemented for constant width and offset");
4025  uint32_t Imm = OffsetWidthOp.getImm();
4026 
4027  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4028  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
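  // Worked example: an immediate of 0x100008 decodes to Offset = 8 and
  // BitWidth = 16, i.e. extract a 16-bit field starting at bit 8.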
4029  Inst.RemoveOperand(2); // Remove old immediate.
4030  Inst.addOperand(MachineOperand::CreateImm(Offset));
4031  Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4032  }
4033 
4034  bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4035  unsigned NewDstReg = AMDGPU::NoRegister;
4036  if (HasDst) {
4037  unsigned DstReg = Inst.getOperand(0).getReg();
4039  continue;
4040 
4041  // Update the destination register class.
4042  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4043  if (!NewDstRC)
4044  continue;
4045 
4046  if (Inst.isCopy() &&
4048  NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4049  // Instead of creating a copy where src and dst are the same register
4050  // class, we just replace all uses of dst with src. These kinds of
4051  // copies interfere with the heuristics MachineSink uses to decide
4052  // whether or not to split a critical edge, since the pass assumes
4053  // that copies will end up as machine instructions and not be
4054  // eliminated.
4055  addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4056  MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4057  MRI.clearKillFlags(Inst.getOperand(1).getReg());
4058  Inst.getOperand(0).setReg(DstReg);
4059 
4060  // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4061  // these are deleted later, but at -O0 it would leave a suspicious
4062  // looking illegal copy of an undef register.
4063  for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4064  Inst.RemoveOperand(I);
4065  Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4066  continue;
4067  }
4068 
4069  NewDstReg = MRI.createVirtualRegister(NewDstRC);
4070  MRI.replaceRegWith(DstReg, NewDstReg);
4071  }
4072 
4073  // Legalize the operands
4074  legalizeOperands(Inst);
4075 
4076  if (HasDst)
4077  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4078  }
4079 }
4080 
4081 // Add/sub require special handling to deal with carry outs.
4082 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
4083  MachineInstr &Inst) const {
4084  if (ST.hasAddNoCarry()) {
4085  // Assume there is no user of scc since we don't select this in that case.
4086  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4087  // is used.
4088 
4089  MachineBasicBlock &MBB = *Inst.getParent();
4090  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4091 
4092  unsigned OldDstReg = Inst.getOperand(0).getReg();
4093  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4094 
4095  unsigned Opc = Inst.getOpcode();
4096  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4097 
4098  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4099  AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4100 
4101  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4102  Inst.RemoveOperand(3);
4103 
4104  Inst.setDesc(get(NewOpc));
4105  Inst.addImplicitDefUseOperands(*MBB.getParent());
4106  MRI.replaceRegWith(OldDstReg, ResultReg);
4107  legalizeOperands(Inst);
4108 
4109  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4110  return true;
4111  }
4112 
4113  return false;
4114 }
4115 
4116 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4117  MachineInstr &Inst) const {
4118  MachineBasicBlock &MBB = *Inst.getParent();
4119  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4120  MachineBasicBlock::iterator MII = Inst;
4121  DebugLoc DL = Inst.getDebugLoc();
4122 
4123  MachineOperand &Dest = Inst.getOperand(0);
4124  MachineOperand &Src = Inst.getOperand(1);
4125  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4126  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4127 
4128  unsigned SubOp = ST.hasAddNoCarry() ?
4129  AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4130 
4131  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4132  .addImm(0)
4133  .addReg(Src.getReg());
4134 
4135  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4136  .addReg(Src.getReg())
4137  .addReg(TmpReg);
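  // This computes abs(x) as max(x, 0 - x): TmpReg holds the negated value and
  // the signed max picks whichever operand is non-negative.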
4138 
4139  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4140  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4141 }
4142 
4143 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4144  MachineInstr &Inst) const {
4145  MachineBasicBlock &MBB = *Inst.getParent();
4146  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4147  MachineBasicBlock::iterator MII = Inst;
4148  const DebugLoc &DL = Inst.getDebugLoc();
4149 
4150  MachineOperand &Dest = Inst.getOperand(0);
4151  MachineOperand &Src0 = Inst.getOperand(1);
4152  MachineOperand &Src1 = Inst.getOperand(2);
4153 
4154  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4155  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4156 
4157  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4158  if (ST.hasDLInsts()) {
4159  BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4160  .add(Src0)
4161  .add(Src1);
4162  } else {
4163  unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4164  BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
4165  .add(Src0)
4166  .add(Src1);
4167 
4168  BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
4169  .addReg(Xor);
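  // Without a native V_XNOR_B32 (only present when the subtarget has DL
  // instructions), xnor(a, b) is expanded as not(xor(a, b)).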
4170  }
4171 
4172  MRI.replaceRegWith(Dest.getReg(), NewDest);
4173  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4174 }
4175 
4176 void SIInstrInfo::splitScalar64BitUnaryOp(
4177  SetVectorType &Worklist, MachineInstr &Inst,
4178  unsigned Opcode) const {
4179  MachineBasicBlock &MBB = *Inst.getParent();
4180  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4181 
4182  MachineOperand &Dest = Inst.getOperand(0);
4183  MachineOperand &Src0 = Inst.getOperand(1);
4184  DebugLoc DL = Inst.getDebugLoc();
4185 
4186  MachineBasicBlock::iterator MII = Inst;
4187 
4188  const MCInstrDesc &InstDesc = get(Opcode);
4189  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4190  MRI.getRegClass(Src0.getReg()) :
4191  &AMDGPU::SGPR_32RegClass;
4192 
4193  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4194 
4195  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4196  AMDGPU::sub0, Src0SubRC);
4197 
4198  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4199  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4200  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4201 
4202  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4203  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4204 
4205  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4206  AMDGPU::sub1, Src0SubRC);
4207 
4208  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4209  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4210 
4211  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4212  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4213  .addReg(DestSub0)
4214  .addImm(AMDGPU::sub0)
4215  .addReg(DestSub1)
4216  .addImm(AMDGPU::sub1);
4217 
4218  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4219 
4220  // We don't need to legalizeOperands here because for a single operand, src0
4221  // will support any kind of input.
4222 
4223  // Move all users of this moved value.
4224  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4225 }
4226 
4227 void SIInstrInfo::splitScalar64BitAddSub(
4228  SetVectorType &Worklist, MachineInstr &Inst) const {
4229  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4230 
4231  MachineBasicBlock &MBB = *Inst.getParent();
4232  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4233 
4234  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4235  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4236  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4237 
4238  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4239  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4240 
4241  MachineOperand &Dest = Inst.getOperand(0);
4242  MachineOperand &Src0 = Inst.getOperand(1);
4243  MachineOperand &Src1 = Inst.getOperand(2);
4244  const DebugLoc &DL = Inst.getDebugLoc();
4245  MachineBasicBlock::iterator MII = Inst;
4246 
4247  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4248  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4249  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4250  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4251 
4252  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4253  AMDGPU::sub0, Src0SubRC);
4254  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4255  AMDGPU::sub0, Src1SubRC);
4256 
4257 
4258  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4259  AMDGPU::sub1, Src0SubRC);
4260  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4261  AMDGPU::sub1, Src1SubRC);
4262 
4263  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4264  MachineInstr *LoHalf =
4265  BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4266  .addReg(CarryReg, RegState::Define)
4267  .add(SrcReg0Sub0)
4268  .add(SrcReg1Sub0);
4269 
4270  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4271  MachineInstr *HiHalf =
4272  BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4273  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4274  .add(SrcReg0Sub1)
4275  .add(SrcReg1Sub1)
4276  .addReg(CarryReg, RegState::Kill);
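  // The low halves are combined with an explicit carry-out in CarryReg; the
  // high halves then use the carry-consuming V_ADDC/V_SUBB form, whose own
  // carry-out is unused and marked dead.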
4277 
4278  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4279  .addReg(DestSub0)
4280  .addImm(AMDGPU::sub0)
4281  .addReg(DestSub1)
4282  .addImm(AMDGPU::sub1);
4283 
4284  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4285 
4286  // Try to legalize the operands in case we need to swap the order to keep it
4287  // valid.
4288  legalizeOperands(*LoHalf);
4289  legalizeOperands(*HiHalf);
4290 
4291  // Move all users of this moved value.
4292  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4293 }
4294 
4295 void SIInstrInfo::splitScalar64BitBinaryOp(
4296  SetVectorType &Worklist, MachineInstr &Inst,
4297  unsigned Opcode) const {
4298  MachineBasicBlock &MBB = *Inst.getParent();
4299  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4300 
4301  MachineOperand &Dest = Inst.getOperand(0);
4302  MachineOperand &Src0 = Inst.getOperand(1);
4303  MachineOperand &Src1 = Inst.getOperand(2);
4304  DebugLoc DL = Inst.getDebugLoc();
4305 
4306  MachineBasicBlock::iterator MII = Inst;
4307 
4308  const MCInstrDesc &InstDesc = get(Opcode);
4309  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4310  MRI.getRegClass(Src0.getReg()) :
4311  &AMDGPU::SGPR_32RegClass;
4312 
4313  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4314  const TargetRegisterClass *Src1RC = Src1.isReg() ?
4315  MRI.getRegClass(Src1.getReg()) :
4316  &AMDGPU::SGPR_32RegClass;
4317 
4318  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4319 
4320  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4321  AMDGPU::sub0, Src0SubRC);
4322  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4323  AMDGPU::sub0, Src1SubRC);
4324 
4325  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4326  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4327  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4328 
4329  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4330  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4331  .add(SrcReg0Sub0)
4332  .add(SrcReg1Sub0);
4333 
4334  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4335  AMDGPU::sub1, Src0SubRC);
4336  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4337  AMDGPU::sub1, Src1SubRC);
4338 
4339  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4340  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4341  .add(SrcReg0Sub1)
4342  .add(SrcReg1Sub1);
4343 
4344  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4345  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4346  .addReg(DestSub0)
4347  .addImm(AMDGPU::sub0)
4348  .addReg(DestSub1)
4349  .addImm(AMDGPU::sub1);
4350 
4351  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4352 
4353  // Try to legalize the operands in case we need to swap the order to keep it
4354  // valid.
4355  legalizeOperands(LoHalf);
4356  legalizeOperands(HiHalf);
4357 
4358  // Move all users of this moved vlaue.
4359  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4360 }
4361 
4362 void SIInstrInfo::splitScalar64BitBCNT(
4363  SetVectorType &Worklist, MachineInstr &Inst) const {
4364  MachineBasicBlock &MBB = *Inst.getParent();
4365  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4366 
4367  MachineBasicBlock::iterator MII = Inst;
4368  DebugLoc DL = Inst.getDebugLoc();
4369 
4370  MachineOperand &Dest = Inst.getOperand(0);
4371  MachineOperand &Src = Inst.getOperand(1);
4372 
4373  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4374  const TargetRegisterClass *SrcRC = Src.isReg() ?
4375  MRI.getRegClass(Src.getReg()) :
4376  &AMDGPU::SGPR_32RegClass;
4377 
4378  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4379  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4380 
4381  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4382 
4383  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4384  AMDGPU::sub0, SrcSubRC);
4385  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4386  AMDGPU::sub1, SrcSubRC);
4387 
4388  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4389 
4390  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4391 
4392  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4393 
4394  // We don't need to legalize operands here. src0 for either instruction can be
4395  // an SGPR, and the second input is unused or determined here.
4396  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4397 }
4398 
4399 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4400  MachineInstr &Inst) const {
4401  MachineBasicBlock &MBB = *Inst.getParent();
4402  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4403  MachineBasicBlock::iterator MII = Inst;
4404  DebugLoc DL = Inst.getDebugLoc();
4405 
4406  MachineOperand &Dest = Inst.getOperand(0);
4407  uint32_t Imm = Inst.getOperand(2).getImm();
4408  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4409  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4410 
4411  (void) Offset;
4412 
4413  // Only sext_inreg cases handled.
4414  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4415  Offset == 0 && "Not implemented");
4416 
4417  if (BitWidth < 32) {
4418  unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4419  unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4420  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4421 
4422  BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4423  .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4424  .addImm(0)
4425  .addImm(BitWidth);
4426 
4427  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4428  .addImm(31)
4429  .addReg(MidRegLo);
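  // For widths narrower than 32 bits, the low half is sign-extended in place
  // with V_BFE_I32 and the high half is filled with the sign bit by an
  // arithmetic shift right of 31.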
4430 
4431  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4432  .addReg(MidRegLo)
4433  .addImm(AMDGPU::sub0)
4434  .addReg(MidRegHi)
4435  .addImm(AMDGPU::sub1);
4436 
4437  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4438  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4439  return;
4440  }
4441 
4442  MachineOperand &Src = Inst.getOperand(1);
4443  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4444  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4445 
4446  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4447  .addImm(31)
4448  .addReg(Src.getReg(), 0, AMDGPU::sub0);
4449 
4450  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4451  .addReg(Src.getReg(), 0, AMDGPU::sub0)
4452  .addImm(AMDGPU::sub0)
4453  .addReg(TmpReg)
4454  .addImm(AMDGPU::sub1);
4455 
4456  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4457  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4458 }
4459 
4460 void SIInstrInfo::addUsersToMoveToVALUWorklist(
4461  unsigned DstReg,
4462  MachineRegisterInfo &MRI,
4463  SetVectorType &Worklist) const {
4464  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4465  E = MRI.use_end(); I != E;) {
4466  MachineInstr &UseMI = *I->getParent();
4467  if (!canReadVGPR(UseMI, I.getOperandNo())) {
4468  Worklist.insert(&UseMI);
4469 
4470  do {
4471  ++I;
4472  } while (I != E && I->getParent() == &UseMI);
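  // The inner loop skips the remaining use operands that belong to the same
  // instruction so UseMI is only queued once.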
4473  } else {
4474  ++I;
4475  }
4476  }
4477 }
4478 
4479 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4480  MachineRegisterInfo &MRI,
4481  MachineInstr &Inst) const {
4482  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4483  MachineBasicBlock *MBB = Inst.getParent();
4484  MachineOperand &Src0 = Inst.getOperand(1);
4485  MachineOperand &Src1 = Inst.getOperand(2);
4486  const DebugLoc &DL = Inst.getDebugLoc();
4487 
4488  switch (Inst.getOpcode()) {
4489  case AMDGPU::S_PACK_LL_B32_B16: {
4490  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4491  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4492 
4493  // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
4494  // 0.
4495  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4496  .addImm(0xffff);
4497 
4498  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
4499  .addReg(ImmReg, RegState::Kill)
4500  .add(Src0);
4501 
4502  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
4503  .add(Src1)
4504  .addImm(16)
4505  .addReg(TmpReg, RegState::Kill);
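  // For S_PACK_LL the result is (Src1 << 16) | (Src0 & 0xffff): the AND keeps
  // the low 16 bits of Src0 and V_LSHL_OR_B32 merges in Src1 shifted into the
  // high half.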
4506  break;
4507  }
4508  case AMDGPU::S_PACK_LH_B32_B16: {
4509  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4510  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4511  .addImm(0xffff);
4512  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
4513  .addReg(ImmReg, RegState::Kill)
4514  .add(Src0)
4515  .add(Src1);
4516  break;
4517  }
4518  case AMDGPU::S_PACK_HH_B32_B16: {
4519  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4520  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4521  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
4522  .addImm(16)
4523  .add(Src0);
4524  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4525  .addImm(0xffff0000);
4526  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
4527  .add(Src1)
4528  .addReg(ImmReg, RegState::Kill)
4529  .addReg(TmpReg, RegState::Kill);
4530  break;
4531  }
4532  default:
4533  llvm_unreachable("unhandled s_pack_* instruction");
4534  }
4535 
4536  MachineOperand &Dest = Inst.getOperand(0);
4537  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4538  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4539 }
4540 
4541 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
4542  MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
4543  // This assumes that all the users of SCC are in the same block
4544  // as the SCC def.
4545  for (MachineInstr &MI :
4546  make_range(MachineBasicBlock::iterator(SCCDefInst),
4547  SCCDefInst.getParent()->end())) {
4548  // Exit if we find another SCC def.
4549  if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
4550  return;
4551 
4552  if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
4553  Worklist.insert(&MI);
4554  }
4555 }
4556 
4557 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
4558  const MachineInstr &Inst) const {
4559  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
4560 
4561  switch (Inst.getOpcode()) {
4562  // For target instructions, getOpRegClass just returns the virtual register
4563  // class associated with the operand, so we need to find an equivalent VGPR
4564  // register class in order to move the instruction to the VALU.
4565  case AMDGPU::COPY:
4566  case AMDGPU::PHI:
4567  case AMDGPU::REG_SEQUENCE:
4568  case AMDGPU::INSERT_SUBREG:
4569  case AMDGPU::WQM:
4570  case AMDGPU::WWM:
4571  if (RI.hasVGPRs(NewDstRC))
4572  return nullptr;
4573 
4574  NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
4575  if (!NewDstRC)
4576  return nullptr;
4577  return NewDstRC;
4578  default:
4579  return NewDstRC;
4580  }
4581 }
4582 
4583 // Find the one SGPR operand we are allowed to use.
4584 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
4585  int OpIndices[3]) const {
4586  const MCInstrDesc &Desc = MI.getDesc();
4587 
4588  // Find the one SGPR operand we are allowed to use.
4589  //
4590  // First we need to consider the instruction's operand requirements before
4591  // legalizing. Some operands are required to be SGPRs, such as implicit uses
4592  // of VCC, but we are still bound by the constant bus requirement to only use
4593  // one.
4594  //
4595  // If the operand's class is an SGPR, we can never move it.
4596 
4597  unsigned SGPRReg = findImplicitSGPRRead(MI);
4598  if (SGPRReg != AMDGPU::NoRegister)
4599  return SGPRReg;
4600 
4601  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
4602  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4603 
4604  for (unsigned i = 0; i < 3; ++i) {
4605  int Idx = OpIndices[i];
4606  if (Idx == -1)
4607  break;
4608 
4609  const MachineOperand &MO = MI.getOperand(Idx);
4610  if (!MO.isReg())
4611  continue;
4612 
4613  // Is this operand statically required to be an SGPR based on the operand
4614  // constraints?
4615  const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
4616  bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
4617  if (IsRequiredSGPR)
4618  return MO.getReg();
4619 
4620  // If this could be a VGPR or an SGPR, check the dynamic register class.
4621  unsigned Reg = MO.getReg();
4622  const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
4623  if (RI.isSGPRClass(RegRC))
4624  UsedSGPRs[i] = Reg;
4625  }
4626 
4627  // We don't have a required SGPR operand, so we have a bit more freedom in
4628  // selecting operands to move.
4629 
4630  // Try to select the most used SGPR. If an SGPR is equal to one of the
4631  // others, we choose that.
4632  //
4633  // e.g.
4634  // V_FMA_F32 v0, s0, s0, s0 -> No moves
4635  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
4636 
4637  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
4638  // prefer those.
4639 
4640  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
4641  if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
4642  SGPRReg = UsedSGPRs[0];
4643  }
4644 
4645  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
4646  if (UsedSGPRs[1] == UsedSGPRs[2])
4647  SGPRReg = UsedSGPRs[1];
4648  }
4649 
4650  return SGPRReg;
4651 }
4652 
4653 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
4654  unsigned OperandName) const {
4655  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
4656  if (Idx == -1)
4657  return nullptr;
4658 
4659  return &MI.getOperand(Idx);
4660 }
4661 
4662 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
4663  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
4664  if (ST.isAmdHsaOS()) {
4665  // Set ATC = 1. GFX9 doesn't have this bit.
4666  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4667  RsrcDataFormat |= (1ULL << 56);
4668 
4669  // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
4670  // BTW, it disables TC L2 and therefore decreases performance.
4671  if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
4672  RsrcDataFormat |= (2ULL << 59);
4673  }
4674 
4675  return RsrcDataFormat;
4676 }
4677 
4678 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
4679  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
4680  AMDGPU::RSRC_TID_ENABLE |
4681  0xffffffff; // Size
4682 
4683  // GFX9 doesn't have ELEMENT_SIZE.
4684  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4685  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
4686  Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
4687  }
4688 
4689  // IndexStride = 64.
4690  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
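  // The encoded INDEX_STRIDE value 3 selects a stride of 64 elements (the
  // encodings 0..3 correspond to strides of 8, 16, 32 and 64), matching the
  // 64-lane wavefront.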
4691 
4692  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
4693  // Clear them unless we want a huge stride.
4694  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4695  Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
4696 
4697  return Rsrc23;
4698 }
4699 
4700 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
4701  unsigned Opc = MI.getOpcode();
4702 
4703  return isSMRD(Opc);
4704 }
4705 
4706 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
4707  unsigned Opc = MI.getOpcode();
4708 
4709  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4710 }
4711 
4712 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
4713  int &FrameIndex) const {
4714  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4715  if (!Addr || !Addr->isFI())
4716  return AMDGPU::NoRegister;
4717 
4718  assert(!MI.memoperands_empty() &&
4719  (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
4720 
4721  FrameIndex = Addr->getIndex();
4722  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
4723 }
4724 
4725 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
4726  int &FrameIndex) const {
4727  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
4728  assert(Addr && Addr->isFI());
4729  FrameIndex = Addr->getIndex();
4730  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
4731 }
4732 
4733 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
4734  int &FrameIndex) const {
4735  if (!MI.mayLoad())
4736  return AMDGPU::NoRegister;
4737 
4738  if (isMUBUF(MI) || isVGPRSpill(MI))
4739  return isStackAccess(MI, FrameIndex);
4740 
4741  if (isSGPRSpill(MI))
4742  return isSGPRStackAccess(MI, FrameIndex);
4743 
4744  return AMDGPU::NoRegister;
4745 }
4746 
4747 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
4748  int &FrameIndex) const {
4749  if (!MI.mayStore())
4750  return AMDGPU::NoRegister;
4751 
4752  if (isMUBUF(MI) || isVGPRSpill(MI))
4753  return isStackAccess(MI, FrameIndex);
4754 
4755  if (isSGPRSpill(MI))
4756  return isSGPRStackAccess(MI, FrameIndex);
4757 
4758  return AMDGPU::NoRegister;
4759 }
4760 
4761 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
4762  unsigned Size = 0;
4763  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
4764  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
4765  while (++I != E && I->isInsideBundle()) {
4766  assert(!I->isBundle() && "No nested bundle!");
4767  Size += getInstSizeInBytes(*I);
4768  }
4769 
4770  return Size;
4771 }
4772 
4773 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
4774  unsigned Opc = MI.getOpcode();
4775  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
4776  unsigned DescSize = Desc.getSize();
4777 
4778  // If we have a definitive size, we can use it. Otherwise we need to inspect
4779  // the operands to know the size.
4780  //
4781  // FIXME: Instructions that have a base 32-bit encoding report their size as
4782  // 4, even though they are really 8 bytes if they have a literal operand.
4783  if (DescSize != 0 && DescSize != 4)
4784  return DescSize;
4785 
4786  if (isFixedSize(MI))
4787  return DescSize;
4788 
4789  // 4-byte instructions may have a 32-bit literal encoded after them. Check
4790  // operands that could ever be literals.
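  // For example, a VOP2 add whose src0 is the literal 0x3f800000 takes 4 bytes
  // of encoding plus a trailing 4-byte literal, 8 bytes in total.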
4791  if (isVALU(MI) || isSALU(MI)) {
4792  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4793  if (Src0Idx == -1)
4794  return 4; // No operands.
4795 
4796  if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
4797  return 8;
4798 
4799  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4800  if (Src1Idx == -1)
4801  return 4;
4802 
4803  if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
4804  return 8;
4805 
4806  return 4;
4807  }
4808 
4809  if (DescSize == 4)
4810  return 4;
4811 
4812  switch (Opc) {
4813  case TargetOpcode::IMPLICIT_DEF:
4814  case TargetOpcode::KILL:
4815  case TargetOpcode::DBG_VALUE:
4817  return 0;
4818  case TargetOpcode::BUNDLE:
4819  return getInstBundleSize(MI);
4820  case TargetOpcode::INLINEASM: {
4821  const MachineFunction *MF = MI.getParent()->getParent();
4822  const char *AsmStr = MI.getOperand(0).getSymbolName();
4823  return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
4824  }
4825  default:
4826  llvm_unreachable("unable to find instruction size");
4827  }
4828 }
4829 
4830 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
4831  if (!isFLAT(MI))
4832  return false;
4833 
4834  if (MI.memoperands_empty())
4835  return true;
4836 
4837  for (const MachineMemOperand *MMO : MI.memoperands()) {
4838  if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
4839  return true;
4840  }
4841  return false;
4842 }
4843 
4844 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
4845  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
4846 }
4847 
4848 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
4849  MachineBasicBlock *IfEnd) const {
4850  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
4851  assert(TI != IfEntry->end());
4852 
4853  MachineInstr *Branch = &(*TI);
4854  MachineFunction *MF = IfEntry->getParent();
4855  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
4856 
4857  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4858  unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4859  MachineInstr *SIIF =
4860  BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
4861  .add(Branch->getOperand(0))
4862  .add(Branch->getOperand(1));
4863  MachineInstr *SIEND =
4864  BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
4865  .addReg(DstReg);
4866 
4867  IfEntry->erase(TI);
4868  IfEntry->insert(IfEntry->end(), SIIF);
4869  IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
4870  }
4871 }
4872 
4873 void SIInstrInfo::convertNonUniformLoopRegion(
4874  MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
4875  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
4876  // We expect 2 terminators, one conditional and one unconditional.
4877  assert(TI != LoopEnd->end());
4878 
4879  MachineInstr *Branch = &(*TI);
4880  MachineFunction *MF = LoopEnd->getParent();
4881  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
4882 
4883  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4884 
4885  unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4886  unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4887  MachineInstrBuilder HeaderPHIBuilder =
4888  BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
4889  for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
4890  E = LoopEntry->pred_end();
4891  PI != E; ++PI) {
4892  if (*PI == LoopEnd) {
4893  HeaderPHIBuilder.addReg(BackEdgeReg);
4894  } else {
4895  MachineBasicBlock *PMBB = *PI;
4896  unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4898  ZeroReg, 0);
4899  HeaderPHIBuilder.addReg(ZeroReg);
4900  }
4901  HeaderPHIBuilder.addMBB(*PI);
4902  }
4903  MachineInstr *HeaderPhi = HeaderPHIBuilder;
4904  MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
4905  get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
4906  .addReg(DstReg)
4907  .add(Branch->getOperand(0));
4908  MachineInstr *SILOOP =
4909  BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
4910  .addReg(BackEdgeReg)
4911  .addMBB(LoopEntry);
4912 
4913  LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
4914  LoopEnd->erase(TI);
4915  LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
4916  LoopEnd->insert(LoopEnd->end(), SILOOP);
4917  }
4918 }
4919 
4922  static const std::pair<int, const char *> TargetIndices[] = {
4923  {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
4924  {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
4925  {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
4926  {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
4927  {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
4928  return makeArrayRef(TargetIndices);
4929 }
4930 
4931 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
4932 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
4935  const ScheduleDAG *DAG) const {
4936  return new GCNHazardRecognizer(DAG->MF);
4937 }
4938 
4939 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
4940 /// pass.
4943  return new GCNHazardRecognizer(MF);
4944 }
4945 
4946 std::pair<unsigned, unsigned>
4948  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
4949 }
4950 
4953  static const std::pair<unsigned, const char *> TargetFlags[] = {
4954  { MO_GOTPCREL, "amdgpu-gotprel" },
4955  { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
4956  { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
4957  { MO_REL32_LO, "amdgpu-rel32-lo" },
4958  { MO_REL32_HI, "amdgpu-rel32-hi" }
4959  };
4960 
4961  return makeArrayRef(TargetFlags);
4962 }
4963 
4965  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
4966  MI.modifiesRegister(AMDGPU::EXEC, &RI);
4967 }
4968 
4972  const DebugLoc &DL,
4973  unsigned DestReg) const {
4974  if (ST.hasAddNoCarry())
4975  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
4976 
4977  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4978  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4979  MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
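  // Without a no-carry add, V_ADD_I32_e64 is used with a scratch SGPR pair as
  // the (dead) carry-out; the allocation hint steers that scratch register
  // towards VCC.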
4980 
4981  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
4982  .addReg(UnusedCarry, RegState::Define | RegState::Dead);
4983 }
4984 
4985 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
4986  switch (Opcode) {
4987  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
4988  case AMDGPU::SI_KILL_I1_TERMINATOR:
4989  return true;
4990  default:
4991  return false;
4992  }
4993 }
4994 
4995 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
4996  switch (Opcode) {
4997  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
4998  return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
4999  case AMDGPU::SI_KILL_I1_PSEUDO:
5000  return get(AMDGPU::SI_KILL_I1_TERMINATOR);
5001  default:
5002  llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
5003  }
5004 }
5005 
5006 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
5007  if (!isSMRD(MI))
5008  return false;
5009 
5010  // Check that it is using a buffer resource.
5011  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
5012  if (Idx == -1) // e.g. s_memtime
5013  return false;
5014 
5015  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
5016  return RCID == AMDGPU::SReg_128RegClassID;
5017 }
5018 
5019 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
5020 enum SIEncodingFamily {
5021  SI = 0,
5022  VI = 1,
5023  SDWA = 2,
5024  SDWA9 = 3,
5025  GFX80 = 4,
5026  GFX9 = 5
5027 };
5028 
5029 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
5030  switch (ST.getGeneration()) {
5031  default:
5032  break;
5033  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
5034  case AMDGPUSubtarget::SEA_ISLANDS:
5035  return SIEncodingFamily::SI;
5036  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
5037  case AMDGPUSubtarget::GFX9:
5038  return SIEncodingFamily::VI;
5039  }
5040  llvm_unreachable("Unknown subtarget generation!");
5041 }
5042 
5043 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
5044  SIEncodingFamily Gen = subtargetEncodingFamily(ST);
5045 
5046  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
5047  ST.getGeneration() >= AMDGPUSubtarget::GFX9)
5048  Gen = SIEncodingFamily::GFX9;
5049 
5050  if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
5051  Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
5052  : SIEncodingFamily::SDWA;
5053  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
5054  // subtarget has UnpackedD16VMem feature.
5055  // TODO: remove this when we discard GFX80 encoding.
5056  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
5057  Gen = SIEncodingFamily::GFX80;
5058 
5059  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
5060 
5061  // -1 means that Opcode is already a native instruction.
5062  if (MCOp == -1)
5063  return Opcode;
5064 
5065  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
5066  // no encoding in the given subtarget generation.
5067  if (MCOp == (uint16_t)-1)
5068  return -1;
5069 
5070  return MCOp;
5071 }