1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "GCNHazardRecognizer.h"
18 #include "SIDefines.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringRef.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/InlineAsm.h"
50 #include "llvm/IR/LLVMContext.h"
51 #include "llvm/MC/MCInstrDesc.h"
52 #include "llvm/Support/Casting.h"
54 #include "llvm/Support/Compiler.h"
59 #include <cassert>
60 #include <cstdint>
61 #include <iterator>
62 #include <utility>
63 
64 using namespace llvm;
65 
66 #define GET_INSTRINFO_CTOR_DTOR
67 #include "AMDGPUGenInstrInfo.inc"
68 
69 namespace llvm {
70 namespace AMDGPU {
71 #define GET_D16ImageDimIntrinsics_IMPL
72 #define GET_ImageDimIntrinsicTable_IMPL
73 #define GET_RsrcIntrinsics_IMPL
74 #include "AMDGPUGenSearchableTables.inc"
75 }
76 }
77 
78 
79 // Must be at least 4 to be able to branch over minimum unconditional branch
80 // code. This is only for making it possible to write reasonably small tests for
81 // long branches.
82 static cl::opt<unsigned>
83 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
84  cl::desc("Restrict range of branch instructions (DEBUG)"));
85 
86 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
87   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
88     RI(ST), ST(ST) {}
89 
90 //===----------------------------------------------------------------------===//
91 // TargetInstrInfo callbacks
92 //===----------------------------------------------------------------------===//
93 
94 static unsigned getNumOperandsNoGlue(SDNode *Node) {
95  unsigned N = Node->getNumOperands();
96  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
97  --N;
98  return N;
99 }
100 
101 /// Returns true if both nodes have the same value for the given
102 /// operand \p Op, or if both nodes do not have this operand.
103 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
104  unsigned Opc0 = N0->getMachineOpcode();
105  unsigned Opc1 = N1->getMachineOpcode();
106 
107  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
108  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
109 
110  if (Op0Idx == -1 && Op1Idx == -1)
111  return true;
112 
113 
114  if ((Op0Idx == -1 && Op1Idx != -1) ||
115  (Op1Idx == -1 && Op0Idx != -1))
116  return false;
117 
118  // getNamedOperandIdx returns the index for the MachineInstr's operands,
119  // which includes the result as the first operand. We are indexing into the
120  // MachineSDNode's operands, so we need to skip the result operand to get
121  // the real index.
122  --Op0Idx;
123  --Op1Idx;
124 
125  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
126 }
127 
128 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
129                                                     AliasAnalysis *AA) const {
130  // TODO: The generic check fails for VALU instructions that should be
131  // rematerializable due to implicit reads of exec. We really want all of the
132  // generic logic for this except for this.
133  switch (MI.getOpcode()) {
134  case AMDGPU::V_MOV_B32_e32:
135  case AMDGPU::V_MOV_B32_e64:
136  case AMDGPU::V_MOV_B64_PSEUDO:
137  // No implicit operands.
138  return MI.getNumOperands() == MI.getDesc().getNumOperands();
139  default:
140  return false;
141  }
142 }
143 
144 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
145                                           int64_t &Offset0,
146                                           int64_t &Offset1) const {
147  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
148  return false;
149 
150  unsigned Opc0 = Load0->getMachineOpcode();
151  unsigned Opc1 = Load1->getMachineOpcode();
152 
153  // Make sure both are actually loads.
154  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
155  return false;
156 
157  if (isDS(Opc0) && isDS(Opc1)) {
158 
159  // FIXME: Handle this case:
160  if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
161  return false;
162 
163  // Check base reg.
164  if (Load0->getOperand(0) != Load1->getOperand(0))
165  return false;
166 
167  // Skip read2 / write2 variants for simplicity.
168   // TODO: We should report true if the used offsets are adjacent (excluding
169   // the st64 versions).
170  int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
171  int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
172  if (Offset0Idx == -1 || Offset1Idx == -1)
173  return false;
174 
175   // XXX - be careful of dataless loads
176  // getNamedOperandIdx returns the index for MachineInstrs. Since they
177  // include the output in the operand list, but SDNodes don't, we need to
178  // subtract the index by one.
179  Offset0Idx -= get(Opc0).NumDefs;
180  Offset1Idx -= get(Opc1).NumDefs;
181  Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
182  Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
183  return true;
184  }
185 
186  if (isSMRD(Opc0) && isSMRD(Opc1)) {
187  // Skip time and cache invalidation instructions.
188  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
189  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
190  return false;
191 
193 
194  // Check base reg.
195  if (Load0->getOperand(0) != Load1->getOperand(0))
196  return false;
197 
198  const ConstantSDNode *Load0Offset =
199  dyn_cast<ConstantSDNode>(Load0->getOperand(1));
200  const ConstantSDNode *Load1Offset =
201  dyn_cast<ConstantSDNode>(Load1->getOperand(1));
202 
203  if (!Load0Offset || !Load1Offset)
204  return false;
205 
206  Offset0 = Load0Offset->getZExtValue();
207  Offset1 = Load1Offset->getZExtValue();
208  return true;
209  }
210 
211  // MUBUF and MTBUF can access the same addresses.
212  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
213 
214  // MUBUF and MTBUF have vaddr at different indices.
215  if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
216  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
217  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
218  return false;
219 
220  int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
221  int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
222 
223  if (OffIdx0 == -1 || OffIdx1 == -1)
224  return false;
225 
226  // getNamedOperandIdx returns the index for MachineInstrs. Since they
227  // include the output in the operand list, but SDNodes don't, we need to
228  // subtract the index by one.
229  OffIdx0 -= get(Opc0).NumDefs;
230  OffIdx1 -= get(Opc1).NumDefs;
231 
232  SDValue Off0 = Load0->getOperand(OffIdx0);
233  SDValue Off1 = Load1->getOperand(OffIdx1);
234 
235  // The offset might be a FrameIndexSDNode.
236  if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
237  return false;
238 
239  Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
240  Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
241  return true;
242  }
243 
244  return false;
245 }
246 
247 static bool isStride64(unsigned Opc) {
248  switch (Opc) {
249  case AMDGPU::DS_READ2ST64_B32:
250  case AMDGPU::DS_READ2ST64_B64:
251  case AMDGPU::DS_WRITE2ST64_B32:
252  case AMDGPU::DS_WRITE2ST64_B64:
253  return true;
254  default:
255  return false;
256  }
257 }
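// (Note: the ST64 read2/write2 forms step between elements in units of 64
// elements, which is why getMemOperandWithOffset below multiplies the element
// size by 64 whenever isStride64 matches.)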
258 
259 bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
260                                           const MachineOperand *&BaseOp,
261                                           int64_t &Offset,
262                                           const TargetRegisterInfo *TRI) const {
263  unsigned Opc = LdSt.getOpcode();
264 
265  if (isDS(LdSt)) {
266  const MachineOperand *OffsetImm =
267  getNamedOperand(LdSt, AMDGPU::OpName::offset);
268  if (OffsetImm) {
269  // Normal, single offset LDS instruction.
270  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
271  // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
272  // report that here?
273  if (!BaseOp)
274  return false;
275 
276  Offset = OffsetImm->getImm();
277  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
278  "operands of type register.");
279  return true;
280  }
281 
282  // The 2 offset instructions use offset0 and offset1 instead. We can treat
283  // these as a load with a single offset if the 2 offsets are consecutive. We
284  // will use this for some partially aligned loads.
285  const MachineOperand *Offset0Imm =
286  getNamedOperand(LdSt, AMDGPU::OpName::offset0);
287  const MachineOperand *Offset1Imm =
288  getNamedOperand(LdSt, AMDGPU::OpName::offset1);
289 
290  uint8_t Offset0 = Offset0Imm->getImm();
291  uint8_t Offset1 = Offset1Imm->getImm();
292 
293  if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
294  // Each of these offsets is in element sized units, so we need to convert
295  // to bytes of the individual reads.
296 
297  unsigned EltSize;
298  if (LdSt.mayLoad())
299  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
300  else {
301  assert(LdSt.mayStore());
302  int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
303  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
304  }
305 
306  if (isStride64(Opc))
307  EltSize *= 64;
308 
309  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
310  Offset = EltSize * Offset0;
311  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
312  "operands of type register.");
313  return true;
314  }
315 
316  return false;
317  }
318 
319  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
320  const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
321  if (SOffset && SOffset->isReg())
322  return false;
323 
324  const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
325  if (!AddrReg)
326  return false;
327 
328  const MachineOperand *OffsetImm =
329  getNamedOperand(LdSt, AMDGPU::OpName::offset);
330  BaseOp = AddrReg;
331  Offset = OffsetImm->getImm();
332 
333  if (SOffset) // soffset can be an inline immediate.
334  Offset += SOffset->getImm();
335 
336  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
337  "operands of type register.");
338  return true;
339  }
340 
341  if (isSMRD(LdSt)) {
342  const MachineOperand *OffsetImm =
343  getNamedOperand(LdSt, AMDGPU::OpName::offset);
344  if (!OffsetImm)
345  return false;
346 
347  const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
348  BaseOp = SBaseReg;
349  Offset = OffsetImm->getImm();
350  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
351  "operands of type register.");
352  return true;
353  }
354 
355  if (isFLAT(LdSt)) {
356  const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
357  if (VAddr) {
358  // Can't analyze 2 offsets.
359  if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
360  return false;
361 
362  BaseOp = VAddr;
363  } else {
364  // scratch instructions have either vaddr or saddr.
365  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
366  }
367 
368  Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
369  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
370  "operands of type register.");
371  return true;
372  }
373 
374  return false;
375 }
376 
377 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
378  const MachineOperand &BaseOp1,
379  const MachineInstr &MI2,
380  const MachineOperand &BaseOp2) {
381  // Support only base operands with base registers.
382  // Note: this could be extended to support FI operands.
383  if (!BaseOp1.isReg() || !BaseOp2.isReg())
384  return false;
385 
386  if (BaseOp1.isIdenticalTo(BaseOp2))
387  return true;
388 
389  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
390  return false;
391 
392  auto MO1 = *MI1.memoperands_begin();
393  auto MO2 = *MI2.memoperands_begin();
394  if (MO1->getAddrSpace() != MO2->getAddrSpace())
395  return false;
396 
397  auto Base1 = MO1->getValue();
398  auto Base2 = MO2->getValue();
399  if (!Base1 || !Base2)
400  return false;
401  const MachineFunction &MF = *MI1.getParent()->getParent();
402  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
403  Base1 = GetUnderlyingObject(Base1, DL);
404   Base2 = GetUnderlyingObject(Base2, DL);
405 
406  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
407  return false;
408 
409  return Base1 == Base2;
410 }
411 
412 bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
413                                       const MachineOperand &BaseOp2,
414                                       unsigned NumLoads) const {
415  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
416  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
417 
418  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
419  return false;
420 
421  const MachineOperand *FirstDst = nullptr;
422  const MachineOperand *SecondDst = nullptr;
423 
424  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
425  (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
426  (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
427  const unsigned MaxGlobalLoadCluster = 6;
428  if (NumLoads > MaxGlobalLoadCluster)
429  return false;
430 
431  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
432  if (!FirstDst)
433  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
434  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
435  if (!SecondDst)
436  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
437  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
438  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
439  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
440  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
441  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
442  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
443  }
444 
445  if (!FirstDst || !SecondDst)
446  return false;
447 
448  // Try to limit clustering based on the total number of bytes loaded
449  // rather than the number of instructions. This is done to help reduce
450  // register pressure. The method used is somewhat inexact, though,
451  // because it assumes that all loads in the cluster will load the
452  // same number of bytes as FirstLdSt.
453 
454  // The unit of this value is bytes.
455  // FIXME: This needs finer tuning.
456  unsigned LoadClusterThreshold = 16;
457 
458  const MachineRegisterInfo &MRI =
459  FirstLdSt.getParent()->getParent()->getRegInfo();
460 
461  const unsigned Reg = FirstDst->getReg();
462 
463   const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
464                                          ? MRI.getRegClass(Reg)
465                                          : RI.getPhysRegClass(Reg);
466 
467  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
468 }
469 
470 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
471 // the first 16 loads will be interleaved with the stores, and the next 16 will
472 // be clustered as expected. It should really split into 2 16 store batches.
473 //
474 // Loads are clustered until this returns false, rather than trying to schedule
475 // groups of stores. This also means we have to deal with saying different
476 // address space loads should be clustered, and ones which might cause bank
477 // conflicts.
478 //
479 // This might be deprecated so it might not be worth that much effort to fix.
480 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
481                                           int64_t Offset0, int64_t Offset1,
482                                           unsigned NumLoads) const {
483  assert(Offset1 > Offset0 &&
484  "Second offset should be larger than first offset!");
485  // If we have less than 16 loads in a row, and the offsets are within 64
486  // bytes, then schedule together.
487 
488  // A cacheline is 64 bytes (for global memory).
489  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
490 }
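// Worked example of the heuristic above: four loads at offsets 0, 16, 32 and
// 48 from the same base are kept together (4 <= 16 and 48 - 0 < 64), whereas
// two loads 128 bytes apart are not forced to schedule near each other.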
491 
492 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
493                               MachineBasicBlock::iterator MI,
494                               const DebugLoc &DL, unsigned DestReg,
495                               unsigned SrcReg, bool KillSrc) {
496  MachineFunction *MF = MBB.getParent();
497  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
498  "illegal SGPR to VGPR copy",
499  DL, DS_Error);
500  LLVMContext &C = MF->getFunction().getContext();
501  C.diagnose(IllegalCopy);
502 
503  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
504  .addReg(SrcReg, getKillRegState(KillSrc));
505 }
506 
507 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
508                               MachineBasicBlock::iterator MI,
509                               const DebugLoc &DL, unsigned DestReg,
510                               unsigned SrcReg, bool KillSrc) const {
511  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
512 
513  if (RC == &AMDGPU::VGPR_32RegClass) {
514  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
515  AMDGPU::SReg_32RegClass.contains(SrcReg));
516  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
517  .addReg(SrcReg, getKillRegState(KillSrc));
518  return;
519  }
520 
521  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
522  RC == &AMDGPU::SReg_32RegClass) {
523  if (SrcReg == AMDGPU::SCC) {
524  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
525  .addImm(-1)
526  .addImm(0);
527  return;
528  }
529 
530  if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
531  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
532  return;
533  }
534 
535  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
536  .addReg(SrcReg, getKillRegState(KillSrc));
537  return;
538  }
539 
540  if (RC == &AMDGPU::SReg_64RegClass) {
541  if (DestReg == AMDGPU::VCC) {
542  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
543  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
544  .addReg(SrcReg, getKillRegState(KillSrc));
545  } else {
546  // FIXME: Hack until VReg_1 removed.
547  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
548  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
549  .addImm(0)
550  .addReg(SrcReg, getKillRegState(KillSrc));
551  }
552 
553  return;
554  }
555 
556  if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
557  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
558  return;
559  }
560 
561  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
562  .addReg(SrcReg, getKillRegState(KillSrc));
563  return;
564  }
565 
566  if (DestReg == AMDGPU::SCC) {
567  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
568  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
569  .addReg(SrcReg, getKillRegState(KillSrc))
570  .addImm(0);
571  return;
572  }
573 
574  unsigned EltSize = 4;
575  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
576  if (RI.isSGPRClass(RC)) {
577  // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
578  if (!(RI.getRegSizeInBits(*RC) % 64)) {
579  Opcode = AMDGPU::S_MOV_B64;
580  EltSize = 8;
581  } else {
582  Opcode = AMDGPU::S_MOV_B32;
583  EltSize = 4;
584  }
585 
586  if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
587  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
588  return;
589  }
590  }
591 
592  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
593  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
594 
595  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
596  unsigned SubIdx;
597  if (Forward)
598  SubIdx = SubIndices[Idx];
599  else
600  SubIdx = SubIndices[SubIndices.size() - Idx - 1];
601 
602  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
603  get(Opcode), RI.getSubReg(DestReg, SubIdx));
604 
605  Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
606 
607  if (Idx == 0)
608  Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
609 
610  bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
611  Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
612  }
613 }
614 
615 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
616  int NewOpc;
617 
618  // Try to map original to commuted opcode
619  NewOpc = AMDGPU::getCommuteRev(Opcode);
620  if (NewOpc != -1)
621  // Check if the commuted (REV) opcode exists on the target.
622  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
623 
624  // Try to map commuted to original opcode
625  NewOpc = AMDGPU::getCommuteOrig(Opcode);
626  if (NewOpc != -1)
627  // Check if the original (non-REV) opcode exists on the target.
628  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
629 
630  return Opcode;
631 }
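// Illustrative example (not exhaustive): the commute tables map a VOP2
// instruction to its _REV counterpart, e.g. V_SUB_F32 <-> V_SUBREV_F32, and
// the pseudoToMCOpcode check above rejects forms the current subtarget lacks.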
632 
633 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
634                                        MachineBasicBlock::iterator MI,
635                                        const DebugLoc &DL, unsigned DestReg,
636                                        int64_t Value) const {
637   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
638   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
639  if (RegClass == &AMDGPU::SReg_32RegClass ||
640  RegClass == &AMDGPU::SGPR_32RegClass ||
641  RegClass == &AMDGPU::SReg_32_XM0RegClass ||
642  RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
643  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
644  .addImm(Value);
645  return;
646  }
647 
648  if (RegClass == &AMDGPU::SReg_64RegClass ||
649  RegClass == &AMDGPU::SGPR_64RegClass ||
650  RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
651  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
652  .addImm(Value);
653  return;
654  }
655 
656  if (RegClass == &AMDGPU::VGPR_32RegClass) {
657  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
658  .addImm(Value);
659  return;
660  }
661  if (RegClass == &AMDGPU::VReg_64RegClass) {
662  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
663  .addImm(Value);
664  return;
665  }
666 
667  unsigned EltSize = 4;
668  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
669  if (RI.isSGPRClass(RegClass)) {
670  if (RI.getRegSizeInBits(*RegClass) > 32) {
671  Opcode = AMDGPU::S_MOV_B64;
672  EltSize = 8;
673  } else {
674  Opcode = AMDGPU::S_MOV_B32;
675  EltSize = 4;
676  }
677  }
678 
679  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
680  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
681  int64_t IdxValue = Idx == 0 ? Value : 0;
682 
683  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
684  get(Opcode), RI.getSubReg(DestReg, Idx));
685  Builder.addImm(IdxValue);
686  }
687 }
688 
689 const TargetRegisterClass *
690 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
691   return &AMDGPU::VGPR_32RegClass;
692 }
693 
694 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
695                                      MachineBasicBlock::iterator I,
696                                      const DebugLoc &DL, unsigned DstReg,
697                                      ArrayRef<MachineOperand> Cond,
698                                      unsigned TrueReg,
699                                      unsigned FalseReg) const {
700   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
701   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
702          "Not a VGPR32 reg");
703 
704  if (Cond.size() == 1) {
705  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
706  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
707  .add(Cond[0]);
708  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
709  .addImm(0)
710  .addReg(FalseReg)
711  .addImm(0)
712  .addReg(TrueReg)
713  .addReg(SReg);
714  } else if (Cond.size() == 2) {
715  assert(Cond[0].isImm() && "Cond[0] is not an immediate");
716  switch (Cond[0].getImm()) {
717  case SIInstrInfo::SCC_TRUE: {
718  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
719  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
720  .addImm(-1)
721  .addImm(0);
722  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
723  .addImm(0)
724  .addReg(FalseReg)
725  .addImm(0)
726  .addReg(TrueReg)
727  .addReg(SReg);
728  break;
729  }
730  case SIInstrInfo::SCC_FALSE: {
731  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
732  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
733  .addImm(0)
734  .addImm(-1);
735  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
736  .addImm(0)
737  .addReg(FalseReg)
738  .addImm(0)
739  .addReg(TrueReg)
740  .addReg(SReg);
741  break;
742  }
743  case SIInstrInfo::VCCNZ: {
744  MachineOperand RegOp = Cond[1];
745  RegOp.setImplicit(false);
746  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
747  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
748  .add(RegOp);
749  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
750  .addImm(0)
751  .addReg(FalseReg)
752  .addImm(0)
753  .addReg(TrueReg)
754  .addReg(SReg);
755  break;
756  }
757  case SIInstrInfo::VCCZ: {
758  MachineOperand RegOp = Cond[1];
759  RegOp.setImplicit(false);
760  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
761  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
762  .add(RegOp);
763  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
764  .addImm(0)
765  .addReg(TrueReg)
766  .addImm(0)
767  .addReg(FalseReg)
768  .addReg(SReg);
769  break;
770  }
771  case SIInstrInfo::EXECNZ: {
772  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
773  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
774  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
775  .addImm(0);
776  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
777  .addImm(-1)
778  .addImm(0);
779  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
780  .addImm(0)
781  .addReg(FalseReg)
782  .addImm(0)
783  .addReg(TrueReg)
784  .addReg(SReg);
785  break;
786  }
787  case SIInstrInfo::EXECZ: {
788  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
789  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
790  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
791  .addImm(0);
792  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
793  .addImm(0)
794  .addImm(-1);
795  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
796  .addImm(0)
797  .addReg(FalseReg)
798  .addImm(0)
799  .addReg(TrueReg)
800  .addReg(SReg);
801  llvm_unreachable("Unhandled branch predicate EXECZ");
802  break;
803  }
804  default:
805  llvm_unreachable("invalid branch predicate");
806  }
807  } else {
808  llvm_unreachable("Can only handle Cond size 1 or 2");
809  }
810 }
811 
812 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
813                                MachineBasicBlock::iterator I,
814                                const DebugLoc &DL,
815                                unsigned SrcReg, int Value) const {
816   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
817   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
818  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
819  .addImm(Value)
820  .addReg(SrcReg);
821 
822  return Reg;
823 }
824 
825 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
826                                MachineBasicBlock::iterator I,
827                                const DebugLoc &DL,
828                                unsigned SrcReg, int Value) const {
829   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
830   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
831  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
832  .addImm(Value)
833  .addReg(SrcReg);
834 
835  return Reg;
836 }
837 
838 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
839 
840  if (RI.getRegSizeInBits(*DstRC) == 32) {
841  return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
842  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
843  return AMDGPU::S_MOV_B64;
844  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
845  return AMDGPU::V_MOV_B64_PSEUDO;
846  }
847  return AMDGPU::COPY;
848 }
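// Register widths other than 32 or 64 bits fall back to a generic COPY here;
// after register allocation copyPhysReg above lowers such copies into a
// per-sub-register sequence of 32-bit or 64-bit moves.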
849 
850 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
851  switch (Size) {
852  case 4:
853  return AMDGPU::SI_SPILL_S32_SAVE;
854  case 8:
855  return AMDGPU::SI_SPILL_S64_SAVE;
856  case 12:
857  return AMDGPU::SI_SPILL_S96_SAVE;
858  case 16:
859  return AMDGPU::SI_SPILL_S128_SAVE;
860  case 20:
861  return AMDGPU::SI_SPILL_S160_SAVE;
862  case 32:
863  return AMDGPU::SI_SPILL_S256_SAVE;
864  case 64:
865  return AMDGPU::SI_SPILL_S512_SAVE;
866  default:
867  llvm_unreachable("unknown register size");
868  }
869 }
870 
871 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
872  switch (Size) {
873  case 4:
874  return AMDGPU::SI_SPILL_V32_SAVE;
875  case 8:
876  return AMDGPU::SI_SPILL_V64_SAVE;
877  case 12:
878  return AMDGPU::SI_SPILL_V96_SAVE;
879  case 16:
880  return AMDGPU::SI_SPILL_V128_SAVE;
881  case 20:
882  return AMDGPU::SI_SPILL_V160_SAVE;
883  case 32:
884  return AMDGPU::SI_SPILL_V256_SAVE;
885  case 64:
886  return AMDGPU::SI_SPILL_V512_SAVE;
887  default:
888  llvm_unreachable("unknown register size");
889  }
890 }
891 
892 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
893                                       MachineBasicBlock::iterator MI,
894                                       unsigned SrcReg, bool isKill,
895                                       int FrameIndex,
896                                       const TargetRegisterClass *RC,
897                                       const TargetRegisterInfo *TRI) const {
898   MachineFunction *MF = MBB.getParent();
899   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
900   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
901  const DebugLoc &DL = MBB.findDebugLoc(MI);
902 
903  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
904  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
905  MachinePointerInfo PtrInfo
906  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
907   MachineMemOperand *MMO
908     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
909                                Size, Align);
910  unsigned SpillSize = TRI->getSpillSize(*RC);
911 
912  if (RI.isSGPRClass(RC)) {
913  MFI->setHasSpilledSGPRs();
914 
915  // We are only allowed to create one new instruction when spilling
916  // registers, so we need to use pseudo instruction for spilling SGPRs.
917  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
918 
919   // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
920  // to make sure we are using the correct register class.
921   if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
922     MachineRegisterInfo &MRI = MF->getRegInfo();
923     MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
924   }
925 
926  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
927  .addReg(SrcReg, getKillRegState(isKill)) // data
928  .addFrameIndex(FrameIndex) // addr
929     .addMemOperand(MMO)
930     .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
931     .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
932  // Add the scratch resource registers as implicit uses because we may end up
933  // needing them, and need to ensure that the reserved registers are
934  // correctly handled.
935 
936  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
937  if (ST.hasScalarStores()) {
938  // m0 is used for offset to scalar stores if used to spill.
939  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
940  }
941 
942  return;
943  }
944 
945  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
946 
947  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
948  MFI->setHasSpilledVGPRs();
949  BuildMI(MBB, MI, DL, get(Opcode))
950  .addReg(SrcReg, getKillRegState(isKill)) // data
951  .addFrameIndex(FrameIndex) // addr
952  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
953  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
954  .addImm(0) // offset
955  .addMemOperand(MMO);
956 }
957 
958 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
959  switch (Size) {
960  case 4:
961  return AMDGPU::SI_SPILL_S32_RESTORE;
962  case 8:
963  return AMDGPU::SI_SPILL_S64_RESTORE;
964  case 12:
965  return AMDGPU::SI_SPILL_S96_RESTORE;
966  case 16:
967  return AMDGPU::SI_SPILL_S128_RESTORE;
968  case 20:
969  return AMDGPU::SI_SPILL_S160_RESTORE;
970  case 32:
971  return AMDGPU::SI_SPILL_S256_RESTORE;
972  case 64:
973  return AMDGPU::SI_SPILL_S512_RESTORE;
974  default:
975  llvm_unreachable("unknown register size");
976  }
977 }
978 
979 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
980  switch (Size) {
981  case 4:
982  return AMDGPU::SI_SPILL_V32_RESTORE;
983  case 8:
984  return AMDGPU::SI_SPILL_V64_RESTORE;
985  case 12:
986  return AMDGPU::SI_SPILL_V96_RESTORE;
987  case 16:
988  return AMDGPU::SI_SPILL_V128_RESTORE;
989  case 20:
990  return AMDGPU::SI_SPILL_V160_RESTORE;
991  case 32:
992  return AMDGPU::SI_SPILL_V256_RESTORE;
993  case 64:
994  return AMDGPU::SI_SPILL_V512_RESTORE;
995  default:
996  llvm_unreachable("unknown register size");
997  }
998 }
999 
1000 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1001                                        MachineBasicBlock::iterator MI,
1002                                        unsigned DestReg, int FrameIndex,
1003                                        const TargetRegisterClass *RC,
1004                                        const TargetRegisterInfo *TRI) const {
1005   MachineFunction *MF = MBB.getParent();
1006   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1007   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1008  const DebugLoc &DL = MBB.findDebugLoc(MI);
1009  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
1010  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
1011  unsigned SpillSize = TRI->getSpillSize(*RC);
1012 
1013  MachinePointerInfo PtrInfo
1014  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1015 
1016   MachineMemOperand *MMO = MF->getMachineMemOperand(
1017       PtrInfo, MachineMemOperand::MOLoad, Size, Align);
1018 
1019  if (RI.isSGPRClass(RC)) {
1020  MFI->setHasSpilledSGPRs();
1021 
1022  // FIXME: Maybe this should not include a memoperand because it will be
1023  // lowered to non-memory instructions.
1024  const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1025   if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
1026     MachineRegisterInfo &MRI = MF->getRegInfo();
1027     MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
1028   }
1029 
1030  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
1031  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
1032  .addFrameIndex(FrameIndex) // addr
1033     .addMemOperand(MMO)
1034     .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
1035     .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
1036 
1037  if (ST.hasScalarStores()) {
1038  // m0 is used for offset to scalar stores if used to spill.
1039  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
1040  }
1041 
1042  return;
1043  }
1044 
1045  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
1046 
1047  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
1048  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1049  .addFrameIndex(FrameIndex) // vaddr
1050  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
1051  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
1052  .addImm(0) // offset
1053  .addMemOperand(MMO);
1054 }
1055 
1056 /// \param FrameOffset Offset in bytes of the FrameIndex being spilled
1057 unsigned SIInstrInfo::calculateLDSSpillAddress(
1058     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
1059     unsigned FrameOffset, unsigned Size) const {
1060   MachineFunction *MF = MBB.getParent();
1061   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1062   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1063  const DebugLoc &DL = MBB.findDebugLoc(MI);
1064  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1065  unsigned WavefrontSize = ST.getWavefrontSize();
1066 
1067  unsigned TIDReg = MFI->getTIDReg();
1068  if (!MFI->hasCalculatedTID()) {
1069  MachineBasicBlock &Entry = MBB.getParent()->front();
1070  MachineBasicBlock::iterator Insert = Entry.front();
1071  const DebugLoc &DL = Insert->getDebugLoc();
1072 
1073  TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1074  *MF);
1075  if (TIDReg == AMDGPU::NoRegister)
1076  return TIDReg;
1077 
1079  WorkGroupSize > WavefrontSize) {
1080  unsigned TIDIGXReg
1082  unsigned TIDIGYReg
1084  unsigned TIDIGZReg
1086  unsigned InputPtrReg =
1088  for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1089  if (!Entry.isLiveIn(Reg))
1090  Entry.addLiveIn(Reg);
1091  }
1092 
1093  RS->enterBasicBlock(Entry);
1094  // FIXME: Can we scavenge an SReg_64 and access the subregs?
1095  unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1096  unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1097  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1098  .addReg(InputPtrReg)
1100  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1101  .addReg(InputPtrReg)
1103 
1104  // NGROUPS.X * NGROUPS.Y
1105  BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1106  .addReg(STmp1)
1107  .addReg(STmp0);
1108  // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1109  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1110  .addReg(STmp1)
1111  .addReg(TIDIGXReg);
1112   // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1113  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1114  .addReg(STmp0)
1115  .addReg(TIDIGYReg)
1116  .addReg(TIDReg);
1117   // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1118  getAddNoCarry(Entry, Insert, DL, TIDReg)
1119  .addReg(TIDReg)
1120  .addReg(TIDIGZReg)
1121  .addImm(0); // clamp bit
1122  } else {
1123  // Get the wave id
1124  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1125  TIDReg)
1126  .addImm(-1)
1127  .addImm(0);
1128 
1129  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1130  TIDReg)
1131  .addImm(-1)
1132  .addReg(TIDReg);
1133  }
1134 
1135  BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1136  TIDReg)
1137  .addImm(2)
1138  .addReg(TIDReg);
1139  MFI->setTIDReg(TIDReg);
1140  }
1141 
1142  // Add FrameIndex to LDS offset
1143  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1144  getAddNoCarry(MBB, MI, DL, TmpReg)
1145  .addImm(LDSOffset)
1146  .addReg(TIDReg)
1147  .addImm(0); // clamp bit
1148 
1149  return TmpReg;
1150 }
1151 
1152 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1153                                    MachineBasicBlock::iterator MI,
1154                                    int Count) const {
1155  DebugLoc DL = MBB.findDebugLoc(MI);
1156  while (Count > 0) {
1157  int Arg;
1158  if (Count >= 8)
1159  Arg = 7;
1160  else
1161  Arg = Count - 1;
1162  Count -= 8;
1163  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1164  .addImm(Arg);
1165  }
1166 }
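// s_nop N stalls for N + 1 wait states, so each iteration above covers up to
// 8 wait states (Arg == 7); getNumWaitStates below mirrors this by returning
// the S_NOP immediate plus one.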
1167 
1168 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1169                              MachineBasicBlock::iterator MI) const {
1170   insertWaitStates(MBB, MI, 1);
1171 }
1172 
1173 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1174   auto MF = MBB.getParent();
1175   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1176 
1177  assert(Info->isEntryFunction());
1178 
1179  if (MBB.succ_empty()) {
1180  bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1181  if (HasNoTerminator) {
1182  if (Info->returnsVoid()) {
1183  BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1184  } else {
1185  BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1186  }
1187  }
1188  }
1189 }
1190 
1191 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1192   switch (MI.getOpcode()) {
1193  default: return 1; // FIXME: Do wait states equal cycles?
1194 
1195  case AMDGPU::S_NOP:
1196  return MI.getOperand(0).getImm() + 1;
1197  }
1198 }
1199 
1200 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1201   MachineBasicBlock &MBB = *MI.getParent();
1202  DebugLoc DL = MBB.findDebugLoc(MI);
1203  switch (MI.getOpcode()) {
1204  default: return TargetInstrInfo::expandPostRAPseudo(MI);
1205  case AMDGPU::S_MOV_B64_term:
1206  // This is only a terminator to get the correct spill code placement during
1207  // register allocation.
1208  MI.setDesc(get(AMDGPU::S_MOV_B64));
1209  break;
1210 
1211  case AMDGPU::S_XOR_B64_term:
1212  // This is only a terminator to get the correct spill code placement during
1213  // register allocation.
1214  MI.setDesc(get(AMDGPU::S_XOR_B64));
1215  break;
1216 
1217  case AMDGPU::S_OR_B64_term:
1218  // This is only a terminator to get the correct spill code placement during
1219  // register allocation.
1220  MI.setDesc(get(AMDGPU::S_OR_B64));
1221  break;
1222 
1223  case AMDGPU::S_ANDN2_B64_term:
1224  // This is only a terminator to get the correct spill code placement during
1225  // register allocation.
1226  MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1227  break;
1228 
1229  case AMDGPU::V_MOV_B64_PSEUDO: {
1230  unsigned Dst = MI.getOperand(0).getReg();
1231  unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1232  unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1233 
1234  const MachineOperand &SrcOp = MI.getOperand(1);
1235  // FIXME: Will this work for 64-bit floating point immediates?
1236  assert(!SrcOp.isFPImm());
1237  if (SrcOp.isImm()) {
1238  APInt Imm(64, SrcOp.getImm());
1239  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1240  .addImm(Imm.getLoBits(32).getZExtValue())
1241  .addReg(Dst, RegState::Implicit | RegState::Define);
1242  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1243  .addImm(Imm.getHiBits(32).getZExtValue())
1244  .addReg(Dst, RegState::Implicit | RegState::Define);
1245  } else {
1246  assert(SrcOp.isReg());
1247       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1248         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1249         .addReg(Dst, RegState::Implicit | RegState::Define);
1250       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1251         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1252         .addReg(Dst, RegState::Implicit | RegState::Define);
1253     }
1254  MI.eraseFromParent();
1255  break;
1256  }
1257  case AMDGPU::V_SET_INACTIVE_B32: {
1258  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1259  .addReg(AMDGPU::EXEC);
1260  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1261  .add(MI.getOperand(2));
1262  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1263  .addReg(AMDGPU::EXEC);
1264  MI.eraseFromParent();
1265  break;
1266  }
1267  case AMDGPU::V_SET_INACTIVE_B64: {
1268  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1269  .addReg(AMDGPU::EXEC);
1270  MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1271  MI.getOperand(0).getReg())
1272  .add(MI.getOperand(2));
1273  expandPostRAPseudo(*Copy);
1274  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1275  .addReg(AMDGPU::EXEC);
1276  MI.eraseFromParent();
1277  break;
1278  }
1279  case AMDGPU::V_MOVRELD_B32_V1:
1280  case AMDGPU::V_MOVRELD_B32_V2:
1281  case AMDGPU::V_MOVRELD_B32_V4:
1282  case AMDGPU::V_MOVRELD_B32_V8:
1283  case AMDGPU::V_MOVRELD_B32_V16: {
1284  const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1285  unsigned VecReg = MI.getOperand(0).getReg();
1286  bool IsUndef = MI.getOperand(1).isUndef();
1287  unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1288  assert(VecReg == MI.getOperand(1).getReg());
1289 
1290  MachineInstr *MovRel =
1291  BuildMI(MBB, MI, DL, MovRelDesc)
1292  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1293  .add(MI.getOperand(2))
1294  .addReg(VecReg, RegState::ImplicitDefine)
1295  .addReg(VecReg,
1296  RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1297 
1298  const int ImpDefIdx =
1299  MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1300  const int ImpUseIdx = ImpDefIdx + 1;
1301  MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1302 
1303  MI.eraseFromParent();
1304  break;
1305  }
1306  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1307  MachineFunction &MF = *MBB.getParent();
1308  unsigned Reg = MI.getOperand(0).getReg();
1309  unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1310  unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1311 
1312  // Create a bundle so these instructions won't be re-ordered by the
1313  // post-RA scheduler.
1314  MIBundleBuilder Bundler(MBB, MI);
1315  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1316 
1317  // Add 32-bit offset from this instruction to the start of the
1318  // constant data.
1319  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1320  .addReg(RegLo)
1321  .add(MI.getOperand(1)));
1322 
1323  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1324  .addReg(RegHi);
1326  MIB.addImm(0);
1327  else
1328  MIB.add(MI.getOperand(2));
1329 
1330  Bundler.append(MIB);
1331  finalizeBundle(MBB, Bundler.begin());
1332 
1333  MI.eraseFromParent();
1334  break;
1335  }
1336  case AMDGPU::ENTER_WWM: {
1337  // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1338  // WWM is entered.
1339  MI.setDesc(get(AMDGPU::S_OR_SAVEEXEC_B64));
1340  break;
1341  }
1342  case AMDGPU::EXIT_WWM: {
1343  // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1344  // WWM is exited.
1345  MI.setDesc(get(AMDGPU::S_MOV_B64));
1346  break;
1347  }
1348  case TargetOpcode::BUNDLE: {
1349  if (!MI.mayLoad())
1350  return false;
1351 
1352     // If it is a load it must be a memory clause
1353     for (MachineBasicBlock::instr_iterator I = MI.getIterator();
1354          I->isBundledWithSucc(); ++I) {
1355  I->unbundleFromSucc();
1356  for (MachineOperand &MO : I->operands())
1357  if (MO.isReg())
1358  MO.setIsInternalRead(false);
1359  }
1360 
1361  MI.eraseFromParent();
1362  break;
1363  }
1364  }
1365  return true;
1366 }
1367 
1368 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1369                                       MachineOperand &Src0,
1370  unsigned Src0OpName,
1371  MachineOperand &Src1,
1372  unsigned Src1OpName) const {
1373  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1374  if (!Src0Mods)
1375  return false;
1376 
1377  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1378  assert(Src1Mods &&
1379  "All commutable instructions have both src0 and src1 modifiers");
1380 
1381  int Src0ModsVal = Src0Mods->getImm();
1382  int Src1ModsVal = Src1Mods->getImm();
1383 
1384  Src1Mods->setImm(Src0ModsVal);
1385  Src0Mods->setImm(Src1ModsVal);
1386  return true;
1387 }
1388 
1389 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1390                                              MachineOperand &RegOp,
1391  MachineOperand &NonRegOp) {
1392  unsigned Reg = RegOp.getReg();
1393  unsigned SubReg = RegOp.getSubReg();
1394  bool IsKill = RegOp.isKill();
1395  bool IsDead = RegOp.isDead();
1396  bool IsUndef = RegOp.isUndef();
1397  bool IsDebug = RegOp.isDebug();
1398 
1399  if (NonRegOp.isImm())
1400  RegOp.ChangeToImmediate(NonRegOp.getImm());
1401  else if (NonRegOp.isFI())
1402  RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1403  else
1404  return nullptr;
1405 
1406  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1407  NonRegOp.setSubReg(SubReg);
1408 
1409  return &MI;
1410 }
1411 
1412 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1413                                                   unsigned Src0Idx,
1414  unsigned Src1Idx) const {
1415  assert(!NewMI && "this should never be used");
1416 
1417  unsigned Opc = MI.getOpcode();
1418  int CommutedOpcode = commuteOpcode(Opc);
1419  if (CommutedOpcode == -1)
1420  return nullptr;
1421 
1422  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1423  static_cast<int>(Src0Idx) &&
1424  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1425  static_cast<int>(Src1Idx) &&
1426  "inconsistency with findCommutedOpIndices");
1427 
1428  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1429  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1430 
1431  MachineInstr *CommutedMI = nullptr;
1432  if (Src0.isReg() && Src1.isReg()) {
1433  if (isOperandLegal(MI, Src1Idx, &Src0)) {
1434  // Be sure to copy the source modifiers to the right place.
1435  CommutedMI
1436  = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1437  }
1438 
1439  } else if (Src0.isReg() && !Src1.isReg()) {
1440  // src0 should always be able to support any operand type, so no need to
1441  // check operand legality.
1442  CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1443  } else if (!Src0.isReg() && Src1.isReg()) {
1444  if (isOperandLegal(MI, Src1Idx, &Src0))
1445  CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1446  } else {
1447  // FIXME: Found two non registers to commute. This does happen.
1448  return nullptr;
1449  }
1450 
1451  if (CommutedMI) {
1452  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1453  Src1, AMDGPU::OpName::src1_modifiers);
1454 
1455  CommutedMI->setDesc(get(CommutedOpcode));
1456  }
1457 
1458  return CommutedMI;
1459 }
1460 
1461 // This needs to be implemented because the source modifiers may be inserted
1462 // between the true commutable operands, and the base
1463 // TargetInstrInfo::commuteInstruction uses it.
1464 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1465                                         unsigned &SrcOpIdx1) const {
1466  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
1467 }
1468 
1469 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
1470  unsigned &SrcOpIdx1) const {
1471  if (!Desc.isCommutable())
1472  return false;
1473 
1474  unsigned Opc = Desc.getOpcode();
1475  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1476  if (Src0Idx == -1)
1477  return false;
1478 
1479  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1480  if (Src1Idx == -1)
1481  return false;
1482 
1483  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1484 }
1485 
1486 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1487  int64_t BrOffset) const {
1488  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1489  // block is unanalyzable.
1490  assert(BranchOp != AMDGPU::S_SETPC_B64);
1491 
1492  // Convert to dwords.
1493  BrOffset /= 4;
1494 
1495  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1496  // from the next instruction.
1497  BrOffset -= 1;
1498 
1499  return isIntN(BranchOffsetBits, BrOffset);
1500 }
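// With the default of 16 branch-offset bits this accepts a signed offset of
// roughly +/-32K dwords, i.e. about +/-128 KiB of code between the branch and
// its target, before relaxation to an indirect branch becomes necessary.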
1501 
1502 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1503     const MachineInstr &MI) const {
1504  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1505  // This would be a difficult analysis to perform, but can always be legal so
1506  // there's no need to analyze it.
1507  return nullptr;
1508  }
1509 
1510  return MI.getOperand(0).getMBB();
1511 }
1512 
1513 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1514                                            MachineBasicBlock &DestBB,
1515  const DebugLoc &DL,
1516  int64_t BrOffset,
1517  RegScavenger *RS) const {
1518  assert(RS && "RegScavenger required for long branching");
1519  assert(MBB.empty() &&
1520  "new block should be inserted for expanding unconditional branch");
1521  assert(MBB.pred_size() == 1);
1522 
1523  MachineFunction *MF = MBB.getParent();
1524  MachineRegisterInfo &MRI = MF->getRegInfo();
1525 
1526  // FIXME: Virtual register workaround for RegScavenger not working with empty
1527  // blocks.
1528  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1529 
1530  auto I = MBB.end();
1531 
1532  // We need to compute the offset relative to the instruction immediately after
1533  // s_getpc_b64. Insert pc arithmetic code before last terminator.
1534  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1535 
1536  // TODO: Handle > 32-bit block address.
1537  if (BrOffset >= 0) {
1538  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1539  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1540  .addReg(PCReg, 0, AMDGPU::sub0)
1542  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1543  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1544  .addReg(PCReg, 0, AMDGPU::sub1)
1545  .addImm(0);
1546  } else {
1547  // Backwards branch.
1548  BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1549  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1550  .addReg(PCReg, 0, AMDGPU::sub0)
1552  BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1553  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1554  .addReg(PCReg, 0, AMDGPU::sub1)
1555  .addImm(0);
1556  }
1557 
1558  // Insert the indirect branch after the other terminator.
1559  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1560  .addReg(PCReg);
1561 
1562  // FIXME: If spilling is necessary, this will fail because this scavenger has
1563  // no emergency stack slots. It is non-trivial to spill in this situation,
1564  // because the restore code needs to be specially placed after the
1565  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1566  // block.
1567  //
1568  // If a spill is needed for the pc register pair, we need to insert a spill
1569  // restore block right before the destination block, and insert a short branch
1570  // into the old destination block's fallthrough predecessor.
1571  // e.g.:
1572  //
1573  // s_cbranch_scc0 skip_long_branch:
1574  //
1575  // long_branch_bb:
1576  // spill s[8:9]
1577  // s_getpc_b64 s[8:9]
1578  // s_add_u32 s8, s8, restore_bb
1579  // s_addc_u32 s9, s9, 0
1580  // s_setpc_b64 s[8:9]
1581  //
1582  // skip_long_branch:
1583  // foo;
1584  //
1585  // .....
1586  //
1587  // dest_bb_fallthrough_predecessor:
1588  // bar;
1589  // s_branch dest_bb
1590  //
1591  // restore_bb:
1592  // restore s[8:9]
1593  // fallthrough dest_bb
1594  ///
1595  // dest_bb:
1596  // buzz;
1597 
1598  RS->enterBasicBlockEnd(MBB);
1599  unsigned Scav = RS->scavengeRegisterBackwards(
1600  AMDGPU::SReg_64RegClass,
1601  MachineBasicBlock::iterator(GetPC), false, 0);
1602  MRI.replaceRegWith(PCReg, Scav);
1603  MRI.clearVirtRegs();
1604  RS->setRegUsed(Scav);
1605 
1606  return 4 + 8 + 4 + 4;
1607 }
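// The size returned above corresponds to s_getpc_b64 (4 bytes), an
// s_add_u32/s_sub_u32 carrying a 32-bit literal (8 bytes), s_addc_u32 or
// s_subb_u32 (4 bytes), and s_setpc_b64 (4 bytes), matching the sequence
// emitted by this function.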
1608 
1609 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1610  switch (Cond) {
1611  case SIInstrInfo::SCC_TRUE:
1612  return AMDGPU::S_CBRANCH_SCC1;
1613  case SIInstrInfo::SCC_FALSE:
1614  return AMDGPU::S_CBRANCH_SCC0;
1615  case SIInstrInfo::VCCNZ:
1616  return AMDGPU::S_CBRANCH_VCCNZ;
1617  case SIInstrInfo::VCCZ:
1618  return AMDGPU::S_CBRANCH_VCCZ;
1619  case SIInstrInfo::EXECNZ:
1620  return AMDGPU::S_CBRANCH_EXECNZ;
1621  case SIInstrInfo::EXECZ:
1622  return AMDGPU::S_CBRANCH_EXECZ;
1623  default:
1624  llvm_unreachable("invalid branch predicate");
1625  }
1626 }
1627 
1628 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1629  switch (Opcode) {
1630  case AMDGPU::S_CBRANCH_SCC0:
1631  return SCC_FALSE;
1632  case AMDGPU::S_CBRANCH_SCC1:
1633  return SCC_TRUE;
1634  case AMDGPU::S_CBRANCH_VCCNZ:
1635  return VCCNZ;
1636  case AMDGPU::S_CBRANCH_VCCZ:
1637  return VCCZ;
1638  case AMDGPU::S_CBRANCH_EXECNZ:
1639  return EXECNZ;
1640  case AMDGPU::S_CBRANCH_EXECZ:
1641  return EXECZ;
1642  default:
1643  return INVALID_BR;
1644  }
1645 }
1646 
1647 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1648                                     MachineBasicBlock::iterator I,
1649                                     MachineBasicBlock *&TBB,
1650                                     MachineBasicBlock *&FBB,
1651                                     SmallVectorImpl<MachineOperand> &Cond,
1652                                     bool AllowModify) const {
1653  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1654  // Unconditional Branch
1655  TBB = I->getOperand(0).getMBB();
1656  return false;
1657  }
1658 
1659  MachineBasicBlock *CondBB = nullptr;
1660 
1661  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1662  CondBB = I->getOperand(1).getMBB();
1663  Cond.push_back(I->getOperand(0));
1664  } else {
1665  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1666  if (Pred == INVALID_BR)
1667  return true;
1668 
1669     CondBB = I->getOperand(0).getMBB();
1670     Cond.push_back(MachineOperand::CreateImm(Pred));
1671     Cond.push_back(I->getOperand(1)); // Save the branch register.
1672  }
1673  ++I;
1674 
1675  if (I == MBB.end()) {
1676  // Conditional branch followed by fall-through.
1677  TBB = CondBB;
1678  return false;
1679  }
1680 
1681  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1682  TBB = CondBB;
1683  FBB = I->getOperand(0).getMBB();
1684  return false;
1685  }
1686 
1687  return true;
1688 }
1689 
1690 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1691                                 MachineBasicBlock *&FBB,
1692                                 SmallVectorImpl<MachineOperand> &Cond,
1693                                 bool AllowModify) const {
1694   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1695   auto E = MBB.end();
1696  if (I == E)
1697  return false;
1698 
1699  // Skip over the instructions that are artificially terminators for special
1700  // exec management.
1701  while (I != E && !I->isBranch() && !I->isReturn() &&
1702  I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
1703  switch (I->getOpcode()) {
1704  case AMDGPU::SI_MASK_BRANCH:
1705  case AMDGPU::S_MOV_B64_term:
1706  case AMDGPU::S_XOR_B64_term:
1707  case AMDGPU::S_OR_B64_term:
1708  case AMDGPU::S_ANDN2_B64_term:
1709  break;
1710  case AMDGPU::SI_IF:
1711  case AMDGPU::SI_ELSE:
1712  case AMDGPU::SI_KILL_I1_TERMINATOR:
1713  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1714  // FIXME: It's messy that these need to be considered here at all.
1715  return true;
1716  default:
1717  llvm_unreachable("unexpected non-branch terminator inst");
1718  }
1719 
1720  ++I;
1721  }
1722 
1723  if (I == E)
1724  return false;
1725 
1726  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1727  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1728 
1729  ++I;
1730 
1731  // TODO: Should be able to treat as fallthrough?
1732  if (I == MBB.end())
1733  return true;
1734 
1735  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1736  return true;
1737 
1738  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1739 
1740  // Specifically handle the case where the conditional branch is to the same
1741  // destination as the mask branch. e.g.
1742  //
1743  // si_mask_branch BB8
1744  // s_cbranch_execz BB8
1745  // s_cbranch BB9
1746  //
1747  // This is required to understand divergent loops which may need the branches
1748  // to be relaxed.
1749  if (TBB != MaskBrDest || Cond.empty())
1750  return true;
1751 
1752  auto Pred = Cond[0].getImm();
1753  return (Pred != EXECZ && Pred != EXECNZ);
1754 }
1755 
1756 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1757                                    int *BytesRemoved) const {
1758   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1759 
1760  unsigned Count = 0;
1761  unsigned RemovedSize = 0;
1762  while (I != MBB.end()) {
1763  MachineBasicBlock::iterator Next = std::next(I);
1764  if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1765  I = Next;
1766  continue;
1767  }
1768 
1769  RemovedSize += getInstSizeInBytes(*I);
1770  I->eraseFromParent();
1771  ++Count;
1772  I = Next;
1773  }
1774 
1775  if (BytesRemoved)
1776  *BytesRemoved = RemovedSize;
1777 
1778  return Count;
1779 }
1780 
1781 // Copy the flags onto the implicit condition register operand.
1782 static void preserveCondRegFlags(MachineOperand &CondReg,
1783                                  const MachineOperand &OrigCond) {
1784  CondReg.setIsUndef(OrigCond.isUndef());
1785  CondReg.setIsKill(OrigCond.isKill());
1786 }
1787 
1788 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1789                                    MachineBasicBlock *TBB,
1790                                    MachineBasicBlock *FBB,
1791                                    ArrayRef<MachineOperand> Cond,
1792                                    const DebugLoc &DL,
1793                                    int *BytesAdded) const {
1794  if (!FBB && Cond.empty()) {
1795  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1796  .addMBB(TBB);
1797  if (BytesAdded)
1798  *BytesAdded = 4;
1799  return 1;
1800  }
1801 
1802   if (Cond.size() == 1 && Cond[0].isReg()) {
1803  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1804  .add(Cond[0])
1805  .addMBB(TBB);
1806  return 1;
1807  }
1808 
1809  assert(TBB && Cond[0].isImm());
1810 
1811  unsigned Opcode
1812  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1813 
1814  if (!FBB) {
1815  Cond[1].isUndef();
1816  MachineInstr *CondBr =
1817  BuildMI(&MBB, DL, get(Opcode))
1818  .addMBB(TBB);
1819 
1820  // Copy the flags onto the implicit condition register operand.
1821  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1822 
1823  if (BytesAdded)
1824  *BytesAdded = 4;
1825  return 1;
1826  }
1827 
1828  assert(TBB && FBB);
1829 
1830  MachineInstr *CondBr =
1831  BuildMI(&MBB, DL, get(Opcode))
1832  .addMBB(TBB);
1833  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1834  .addMBB(FBB);
1835 
1836  MachineOperand &CondReg = CondBr->getOperand(1);
1837  CondReg.setIsUndef(Cond[1].isUndef());
1838  CondReg.setIsKill(Cond[1].isKill());
1839 
1840  if (BytesAdded)
1841  *BytesAdded = 8;
1842 
1843  return 2;
1844 }
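// Illustrative note (not from the original source): when only a conditional
// branch with a fallthrough false successor is needed, insertBranch emits a
// single s_cbranch_* and reports 4 bytes; when an explicit false destination
// is also given it emits the conditional branch followed by an unconditional
// s_branch and reports 8 bytes, e.g.
//
//   s_cbranch_scc1 BB_true
//   s_branch BB_false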
1845 
1846 bool SIInstrInfo::reverseBranchCondition(
 1847  SmallVectorImpl<MachineOperand> &Cond) const {
1848  if (Cond.size() != 2) {
1849  return true;
1850  }
1851 
1852  if (Cond[0].isImm()) {
1853  Cond[0].setImm(-Cond[0].getImm());
1854  return false;
1855  }
1856 
1857  return true;
1858 }
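// Sketch of why negating the immediate works (an assumption based on the
// negation here and in insertSelect, not spelled out in this file): the
// BranchPredicate values appear to encode inverse predicates as negated
// values, so reversing a condition such as {SCC_TRUE, $scc} would yield
// {SCC_FALSE, $scc}, and VCCNZ would become VCCZ.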
1859 
1860 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1861  ArrayRef<MachineOperand> Cond,
 1862  unsigned TrueReg, unsigned FalseReg,
 1863  int &CondCycles,
 1864  int &TrueCycles, int &FalseCycles) const {
1865  switch (Cond[0].getImm()) {
1866  case VCCNZ:
1867  case VCCZ: {
1868  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1869  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1870  assert(MRI.getRegClass(FalseReg) == RC);
1871 
1872  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1873  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1874 
1875  // Limit to equal cost for branch vs. N v_cndmask_b32s.
1876  return !RI.isSGPRClass(RC) && NumInsts <= 6;
1877  }
1878  case SCC_TRUE:
1879  case SCC_FALSE: {
1880  // FIXME: We could insert for VGPRs if we could replace the original compare
1881  // with a vector one.
1882  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1883  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1884  assert(MRI.getRegClass(FalseReg) == RC);
1885 
1886  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1887 
 1888  // Multiples of 64 bits can use s_cselect_b64 for each pair of 32-bit halves.
1889  if (NumInsts % 2 == 0)
1890  NumInsts /= 2;
1891 
1892  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1893  return RI.isSGPRClass(RC);
1894  }
1895  default:
1896  return false;
1897  }
1898 }
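// Worked example (illustrative only): for a 128-bit VGPR class, NumInsts is
// 128 / 32 = 4 v_cndmask_b32 instructions, which is within the limit of 6.
// For a 128-bit SGPR class under SCC_TRUE/SCC_FALSE, the four 32-bit pieces
// pair up into 4 / 2 = 2 s_cselect_b64 instructions.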
1899 
1900 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1901  MachineBasicBlock::iterator I, const DebugLoc &DL,
 1902  unsigned DstReg, ArrayRef<MachineOperand> Cond,
 1903  unsigned TrueReg, unsigned FalseReg) const {
1904  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1905  if (Pred == VCCZ || Pred == SCC_FALSE) {
1906  Pred = static_cast<BranchPredicate>(-Pred);
1907  std::swap(TrueReg, FalseReg);
1908  }
1909 
1910  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 1911  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1912  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1913 
1914  if (DstSize == 32) {
1915  unsigned SelOp = Pred == SCC_TRUE ?
1916  AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1917 
 1918  // The select's source operands are reversed: the false value comes first.
1919  MachineInstr *Select =
1920  BuildMI(MBB, I, DL, get(SelOp), DstReg)
1921  .addReg(FalseReg)
1922  .addReg(TrueReg);
1923 
1924  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1925  return;
1926  }
1927 
1928  if (DstSize == 64 && Pred == SCC_TRUE) {
1929  MachineInstr *Select =
1930  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1931  .addReg(FalseReg)
1932  .addReg(TrueReg);
1933 
1934  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1935  return;
1936  }
1937 
1938  static const int16_t Sub0_15[] = {
1939  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1940  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1941  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1942  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1943  };
1944 
1945  static const int16_t Sub0_15_64[] = {
1946  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1947  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1948  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1949  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1950  };
1951 
1952  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1953  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1954  const int16_t *SubIndices = Sub0_15;
1955  int NElts = DstSize / 32;
1956 
1957  // 64-bit select is only available for SALU.
1958  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
1959  if (Pred == SCC_TRUE) {
1960  if (NElts % 2) {
1961  SelOp = AMDGPU::S_CSELECT_B32;
1962  EltRC = &AMDGPU::SGPR_32RegClass;
1963  } else {
1964  SelOp = AMDGPU::S_CSELECT_B64;
1965  EltRC = &AMDGPU::SGPR_64RegClass;
1966  SubIndices = Sub0_15_64;
1967  NElts /= 2;
1968  }
1969  }
1970 
1971  MachineInstrBuilder MIB = BuildMI(
 1972  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1973 
1974  I = MIB->getIterator();
1975 
1977  for (int Idx = 0; Idx != NElts; ++Idx) {
1978  unsigned DstElt = MRI.createVirtualRegister(EltRC);
1979  Regs.push_back(DstElt);
1980 
1981  unsigned SubIdx = SubIndices[Idx];
1982 
1983  MachineInstr *Select =
1984  BuildMI(MBB, I, DL, get(SelOp), DstElt)
1985  .addReg(FalseReg, 0, SubIdx)
1986  .addReg(TrueReg, 0, SubIdx);
1987  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1988 
1989  MIB.addReg(DstElt)
1990  .addImm(SubIdx);
1991  }
1992 }
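// Illustrative expansion (not taken from the source) of a 64-bit VGPR select
// on VCCNZ, split into per-32-bit-subregister selects feeding a REG_SEQUENCE:
//
//   %lo = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   %hi = V_CNDMASK_B32_e32 %false.sub1, %true.sub1, implicit $vcc
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1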
1993 
1994 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
 1995  switch (MI.getOpcode()) {
1996  case AMDGPU::V_MOV_B32_e32:
1997  case AMDGPU::V_MOV_B32_e64:
1998  case AMDGPU::V_MOV_B64_PSEUDO: {
1999  // If there are additional implicit register operands, this may be used for
2000  // register indexing so the source register operand isn't simply copied.
2001  unsigned NumOps = MI.getDesc().getNumOperands() +
2002  MI.getDesc().getNumImplicitUses();
2003 
2004  return MI.getNumOperands() == NumOps;
2005  }
2006  case AMDGPU::S_MOV_B32:
2007  case AMDGPU::S_MOV_B64:
2008  case AMDGPU::COPY:
2009  return true;
2010  default:
2011  return false;
2012  }
2013 }
2014 
2016  unsigned Kind) const {
2017  switch(Kind) {
2028  }
2029  return AMDGPUAS::FLAT_ADDRESS;
2030 }
2031 
2032 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
 2033  unsigned Opc = MI.getOpcode();
2034  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2035  AMDGPU::OpName::src0_modifiers);
2036  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2037  AMDGPU::OpName::src1_modifiers);
2038  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2039  AMDGPU::OpName::src2_modifiers);
2040 
2041  MI.RemoveOperand(Src2ModIdx);
2042  MI.RemoveOperand(Src1ModIdx);
2043  MI.RemoveOperand(Src0ModIdx);
2044 }
2045 
2046 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 2047  unsigned Reg, MachineRegisterInfo *MRI) const {
2048  if (!MRI->hasOneNonDBGUse(Reg))
2049  return false;
2050 
2051  switch (DefMI.getOpcode()) {
2052  default:
2053  return false;
2054  case AMDGPU::S_MOV_B64:
 2055  // TODO: We could fold 64-bit immediates, but this gets complicated
2056  // when there are sub-registers.
2057  return false;
2058 
2059  case AMDGPU::V_MOV_B32_e32:
2060  case AMDGPU::S_MOV_B32:
2061  break;
2062  }
2063 
2064  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2065  assert(ImmOp);
2066  // FIXME: We could handle FrameIndex values here.
2067  if (!ImmOp->isImm())
2068  return false;
2069 
2070  unsigned Opc = UseMI.getOpcode();
2071  if (Opc == AMDGPU::COPY) {
2072  bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
2073  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2074  UseMI.setDesc(get(NewOpc));
2075  UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
2076  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2077  return true;
2078  }
2079 
2080  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2081  Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2082  // Don't fold if we are using source or output modifiers. The new VOP2
2083  // instructions don't have them.
2084  if (hasAnyModifiersSet(UseMI))
2085  return false;
2086 
2087  // If this is a free constant, there's no reason to do this.
2088  // TODO: We could fold this here instead of letting SIFoldOperands do it
2089  // later.
2090  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2091 
2092  // Any src operand can be used for the legality check.
2093  if (isInlineConstant(UseMI, *Src0, *ImmOp))
2094  return false;
2095 
2096  bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2097  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2098  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2099 
2100  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2101  // We should only expect these to be on src0 due to canonicalizations.
2102  if (Src0->isReg() && Src0->getReg() == Reg) {
2103  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2104  return false;
2105 
2106  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2107  return false;
2108 
2109  // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2110 
2111  const int64_t Imm = ImmOp->getImm();
2112 
2113  // FIXME: This would be a lot easier if we could return a new instruction
2114  // instead of having to modify in place.
2115 
2116  // Remove these first since they are at the end.
2117  UseMI.RemoveOperand(
2118  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2119  UseMI.RemoveOperand(
2120  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2121 
2122  unsigned Src1Reg = Src1->getReg();
2123  unsigned Src1SubReg = Src1->getSubReg();
2124  Src0->setReg(Src1Reg);
2125  Src0->setSubReg(Src1SubReg);
2126  Src0->setIsKill(Src1->isKill());
2127 
2128  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2129  Opc == AMDGPU::V_MAC_F16_e64)
2130  UseMI.untieRegOperand(
2131  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2132 
2133  Src1->ChangeToImmediate(Imm);
2134 
2135  removeModOperands(UseMI);
2136  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2137 
2138  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2139  if (DeleteDef)
2140  DefMI.eraseFromParent();
2141 
2142  return true;
2143  }
2144 
2145  // Added part is the constant: Use v_madak_{f16, f32}.
2146  if (Src2->isReg() && Src2->getReg() == Reg) {
2147  // Not allowed to use constant bus for another operand.
2148  // We can however allow an inline immediate as src0.
2149  bool Src0Inlined = false;
2150  if (Src0->isReg()) {
2151  // Try to inline constant if possible.
 2152  // If the def is a move of an immediate and this is its only use,
 2153  // folding it here saves a VGPR.
2154  MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2155  if (Def && Def->isMoveImmediate() &&
2156  isInlineConstant(Def->getOperand(1)) &&
2157  MRI->hasOneUse(Src0->getReg())) {
2158  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2159  Src0Inlined = true;
2160  } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
2161  RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
2162  (RI.isVirtualRegister(Src0->getReg()) &&
2163  RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2164  return false;
2165  // VGPR is okay as Src0 - fallthrough
2166  }
2167 
 2168  if (Src1->isReg() && !Src0Inlined) {
2169  // We have one slot for inlinable constant so far - try to fill it
2170  MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2171  if (Def && Def->isMoveImmediate() &&
2172  isInlineConstant(Def->getOperand(1)) &&
2173  MRI->hasOneUse(Src1->getReg()) &&
2174  commuteInstruction(UseMI)) {
2175  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2176  } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
2177  RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2178  (RI.isVirtualRegister(Src1->getReg()) &&
2179  RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2180  return false;
2181  // VGPR is okay as Src1 - fallthrough
2182  }
2183 
2184  const int64_t Imm = ImmOp->getImm();
2185 
2186  // FIXME: This would be a lot easier if we could return a new instruction
2187  // instead of having to modify in place.
2188 
2189  // Remove these first since they are at the end.
2190  UseMI.RemoveOperand(
2191  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2192  UseMI.RemoveOperand(
2193  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2194 
2195  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2196  Opc == AMDGPU::V_MAC_F16_e64)
2197  UseMI.untieRegOperand(
2198  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2199 
 2200  // ChangeToImmediate adds Src2 back to the instruction.
2201  Src2->ChangeToImmediate(Imm);
2202 
2203  // These come before src2.
2204  removeModOperands(UseMI);
2205  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2206 
2207  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2208  if (DeleteDef)
2209  DefMI.eraseFromParent();
2210 
2211  return true;
2212  }
2213  }
2214 
2215  return false;
2216 }
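// Illustrative fold (operands assumed for the example, not from the source):
// with a single-use immediate def feeding the multiplied operand of a MAC,
//
//   %k = V_MOV_B32_e32 0x41000000
//   %d = V_MAC_F32_e64 %k, %v1, %d
//
// the constant is moved into the instruction and the result becomes
//
//   %d = V_MADMK_F32 %v1, 0x41000000, %d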
2217 
 2218 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2219  int WidthB, int OffsetB) {
2220  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2221  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2222  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2223  return LowOffset + LowWidth <= HighOffset;
2224 }
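// Illustrative values (chosen only for demonstration): two 4-byte accesses at
// offsets 0 and 8 do not overlap, while 4-byte accesses at offsets 0 and 2 do:
//
//   offsetsDoNotOverlap(4, 0, 4, 8); // true:  0 + 4 <= 8
//   offsetsDoNotOverlap(4, 0, 4, 2); // false: 0 + 4 >  2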
2225 
2226 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
2227  const MachineInstr &MIb) const {
2228  const MachineOperand *BaseOp0, *BaseOp1;
2229  int64_t Offset0, Offset1;
2230 
2231  if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
2232  getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
2233  if (!BaseOp0->isIdenticalTo(*BaseOp1))
2234  return false;
2235 
2236  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2237  // FIXME: Handle ds_read2 / ds_write2.
2238  return false;
2239  }
2240  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2241  unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2242  if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2243  return true;
2244  }
2245  }
2246 
2247  return false;
2248 }
2249 
2250 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
 2251  const MachineInstr &MIb,
2252  AliasAnalysis *AA) const {
2253  assert((MIa.mayLoad() || MIa.mayStore()) &&
2254  "MIa must load from or modify a memory location");
2255  assert((MIb.mayLoad() || MIb.mayStore()) &&
2256  "MIb must load from or modify a memory location");
2257 
2259  return false;
2260 
2261  // XXX - Can we relax this between address spaces?
2262  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2263  return false;
2264 
2265  // TODO: Should we check the address space from the MachineMemOperand? That
2266  // would allow us to distinguish objects we know don't alias based on the
2267  // underlying address space, even if it was lowered to a different one,
2268  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2269  // buffer.
2270  if (isDS(MIa)) {
2271  if (isDS(MIb))
2272  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2273 
2274  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2275  }
2276 
2277  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2278  if (isMUBUF(MIb) || isMTBUF(MIb))
2279  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2280 
2281  return !isFLAT(MIb) && !isSMRD(MIb);
2282  }
2283 
2284  if (isSMRD(MIa)) {
2285  if (isSMRD(MIb))
2286  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2287 
2288  return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2289  }
2290 
2291  if (isFLAT(MIa)) {
2292  if (isFLAT(MIb))
2293  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2294 
2295  return false;
2296  }
2297 
2298  return false;
2299 }
2300 
2301 static int64_t getFoldableImm(const MachineOperand* MO) {
2302  if (!MO->isReg())
 2303  return 0;
2304  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2305  const MachineRegisterInfo &MRI = MF->getRegInfo();
2306  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2307  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2308  Def->getOperand(1).isImm())
2309  return Def->getOperand(1).getImm();
 2310  return 0;
2311 }
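// Usage sketch (illustrative): if the operand is a virtual register whose
// unique def is "V_MOV_B32_e32 42", getFoldableImm returns 42; for any other
// operand it returns 0, which the callers below treat as "no foldable
// immediate".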
2312 
2313 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
 2314  MachineInstr &MI,
2315  LiveVariables *LV) const {
2316  unsigned Opc = MI.getOpcode();
2317  bool IsF16 = false;
2318  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2319 
2320  switch (Opc) {
2321  default:
2322  return nullptr;
2323  case AMDGPU::V_MAC_F16_e64:
 2324  IsF16 = true;
2325  LLVM_FALLTHROUGH;
 2326  case AMDGPU::V_MAC_F32_e64:
2327  case AMDGPU::V_FMAC_F32_e64:
2328  break;
2329  case AMDGPU::V_MAC_F16_e32:
 2330  IsF16 = true;
2331  LLVM_FALLTHROUGH;
 2332  case AMDGPU::V_MAC_F32_e32:
2333  case AMDGPU::V_FMAC_F32_e32: {
2334  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2335  AMDGPU::OpName::src0);
2336  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2337  if (!Src0->isReg() && !Src0->isImm())
2338  return nullptr;
2339 
2340  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2341  return nullptr;
2342 
2343  break;
2344  }
2345  }
2346 
2347  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2348  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2349  const MachineOperand *Src0Mods =
2350  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2351  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2352  const MachineOperand *Src1Mods =
2353  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2354  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2355  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2356  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2357 
2358  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2359  // If we have an SGPR input, we will violate the constant bus restriction.
2360  (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2361  if (auto Imm = getFoldableImm(Src2)) {
2362  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2363  get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2364  .add(*Dst)
2365  .add(*Src0)
2366  .add(*Src1)
2367  .addImm(Imm);
2368  }
2369  if (auto Imm = getFoldableImm(Src1)) {
2370  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2371  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2372  .add(*Dst)
2373  .add(*Src0)
2374  .addImm(Imm)
2375  .add(*Src2);
2376  }
2377  if (auto Imm = getFoldableImm(Src0)) {
2378  if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2379  AMDGPU::OpName::src0), Src1))
2380  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2381  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2382  .add(*Dst)
2383  .add(*Src1)
2384  .addImm(Imm)
2385  .add(*Src2);
2386  }
2387  }
2388 
2389  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2390  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2391  (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2392  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2393  .add(*Dst)
2394  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2395  .add(*Src0)
2396  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2397  .add(*Src1)
2398  .addImm(0) // Src mods
2399  .add(*Src2)
2400  .addImm(Clamp ? Clamp->getImm() : 0)
2401  .addImm(Omod ? Omod->getImm() : 0);
2402 }
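// Illustrative conversion (operands assumed for the example): a two-address
//
//   %d = V_MAC_F32_e64 %a, %b, %d(tied)
//
// becomes the three-address
//
//   %d = V_MAD_F32 %a, %b, %old_d
//
// or, when one source has a foldable immediate, one of the v_madak/v_madmk
// forms built above.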
2403 
2404 // It's not generally safe to move VALU instructions across these since it will
2405 // start using the register as a base index rather than directly.
2406 // XXX - Why isn't hasSideEffects sufficient for these?
2407 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
 2408  switch (MI.getOpcode()) {
2409  case AMDGPU::S_SET_GPR_IDX_ON:
2410  case AMDGPU::S_SET_GPR_IDX_MODE:
2411  case AMDGPU::S_SET_GPR_IDX_OFF:
2412  return true;
2413  default:
2414  return false;
2415  }
2416 }
2417 
2418 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
 2419  const MachineBasicBlock *MBB,
2420  const MachineFunction &MF) const {
2421  // XXX - Do we want the SP check in the base implementation?
2422 
2423  // Target-independent instructions do not have an implicit-use of EXEC, even
2424  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2425  // boundaries prevents incorrect movements of such instructions.
2426  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2427  MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2428  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2429  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2430  changesVGPRIndexingMode(MI);
 2431 }
2432 
2433 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
2434  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
2435  Opcode == AMDGPU::DS_GWS_INIT ||
2436  Opcode == AMDGPU::DS_GWS_SEMA_V ||
2437  Opcode == AMDGPU::DS_GWS_SEMA_BR ||
2438  Opcode == AMDGPU::DS_GWS_SEMA_P ||
2439  Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
2440  Opcode == AMDGPU::DS_GWS_BARRIER;
2441 }
2442 
2444  unsigned Opcode = MI.getOpcode();
2445 
2446  if (MI.mayStore() && isSMRD(MI))
2447  return true; // scalar store or atomic
2448 
2449  // These instructions cause shader I/O that may cause hardware lockups
2450  // when executed with an empty EXEC mask.
2451  //
2452  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2453  // EXEC = 0, but checking for that case here seems not worth it
2454  // given the typical code patterns.
2455  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2456  Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
2457  Opcode == AMDGPU::DS_ORDERED_COUNT)
2458  return true;
2459 
2460  if (MI.isInlineAsm())
2461  return true; // conservative assumption
2462 
2463  // These are like SALU instructions in terms of effects, so it's questionable
2464  // whether we should return true for those.
2465  //
2466  // However, executing them with EXEC = 0 causes them to operate on undefined
2467  // data, which we avoid by returning true here.
2468  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2469  return true;
2470 
2471  return false;
2472 }
2473 
2475  const MachineInstr &MI) const {
2476  if (MI.isMetaInstruction())
2477  return false;
2478 
2479  // This won't read exec if this is an SGPR->SGPR copy.
2480  if (MI.isCopyLike()) {
2481  if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
2482  return true;
2483 
2484  // Make sure this isn't copying exec as a normal operand
2485  return MI.readsRegister(AMDGPU::EXEC, &RI);
2486  }
2487 
2488  // Be conservative with any unhandled generic opcodes.
2489  if (!isTargetSpecificOpcode(MI.getOpcode()))
2490  return true;
2491 
2492  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
2493 }
2494 
2495 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2496  switch (Imm.getBitWidth()) {
 2497  case 32:
2498  return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
 2499  ST.hasInv2PiInlineImm());
 2500  case 64:
2501  return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
 2502  ST.hasInv2PiInlineImm());
 2503  case 16:
 2504  return ST.has16BitInsts() &&
2505  AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
 2506  ST.hasInv2PiInlineImm());
2507  default:
2508  llvm_unreachable("invalid bitwidth");
2509  }
2510 }
2511 
2512 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
 2513  uint8_t OperandType) const {
2514  if (!MO.isImm() ||
2515  OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2516  OperandType > AMDGPU::OPERAND_SRC_LAST)
2517  return false;
2518 
2519  // MachineOperand provides no way to tell the true operand size, since it only
2520  // records a 64-bit value. We need to know the size to determine if a 32-bit
2521  // floating point immediate bit pattern is legal for an integer immediate. It
2522  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2523 
2524  int64_t Imm = MO.getImm();
2525  switch (OperandType) {
2530  int32_t Trunc = static_cast<int32_t>(Imm);
2532  }
2538  ST.hasInv2PiInlineImm());
2543  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2544  // A few special case instructions have 16-bit operands on subtargets
2545  // where 16-bit instructions are not legal.
2546  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2547  // constants in these cases
2548  int16_t Trunc = static_cast<int16_t>(Imm);
2549  return ST.has16BitInsts() &&
2551  }
2552 
2553  return false;
2554  }
2557  if (isUInt<16>(Imm)) {
2558  int16_t Trunc = static_cast<int16_t>(Imm);
2559  return ST.has16BitInsts() &&
2561  }
2562  if (!(Imm & 0xffff)) {
2563  return ST.has16BitInsts() &&
2565  }
2566  uint32_t Trunc = static_cast<uint32_t>(Imm);
2568  }
2569  default:
2570  llvm_unreachable("invalid bitwidth");
2571  }
2572 }
2573 
2574 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
 2575  const MCOperandInfo &OpInfo) const {
2576  switch (MO.getType()) {
2578  return false;
2580  return !isInlineConstant(MO, OpInfo);
2586  return true;
2587  default:
2588  llvm_unreachable("unexpected operand type");
2589  }
2590 }
2591 
2592 static bool compareMachineOp(const MachineOperand &Op0,
2593  const MachineOperand &Op1) {
2594  if (Op0.getType() != Op1.getType())
2595  return false;
2596 
2597  switch (Op0.getType()) {
2598  case MachineOperand::MO_Register:
 2599  return Op0.getReg() == Op1.getReg();
2600  case MachineOperand::MO_Immediate:
 2601  return Op0.getImm() == Op1.getImm();
2602  default:
2603  llvm_unreachable("Didn't expect to be comparing these operand types");
2604  }
2605 }
2606 
2607 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
 2608  const MachineOperand &MO) const {
2609  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2610 
2611  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2612 
2613  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2614  return true;
2615 
2616  if (OpInfo.RegClass < 0)
2617  return false;
2618 
2619  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2620  return RI.opCanUseInlineConstant(OpInfo.OperandType);
2621 
2622  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2623 }
2624 
2625 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2626  int Op32 = AMDGPU::getVOPe32(Opcode);
2627  if (Op32 == -1)
2628  return false;
2629 
2630  return pseudoToMCOpcode(Op32) != -1;
2631 }
2632 
2633 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2634  // The src0_modifier operand is present on all instructions
2635  // that have modifiers.
2636 
2637  return AMDGPU::getNamedOperandIdx(Opcode,
2638  AMDGPU::OpName::src0_modifiers) != -1;
2639 }
2640 
2641 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
 2642  unsigned OpName) const {
2643  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2644  return Mods && Mods->getImm();
2645 }
2646 
2647 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
 2648  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2649  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2650  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2651  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2652  hasModifiersSet(MI, AMDGPU::OpName::omod);
2653 }
2654 
2656  const MachineRegisterInfo &MRI) const {
2657  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2658  // Can't shrink instruction with three operands.
2659  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
2660  // a special case for it. It can only be shrunk if the third operand
2661  // is vcc, and src0_modifiers and src1_modifiers are not set.
 2662  // We should handle this the same way we handle VOPC, by adding
 2663  // a register allocation hint pre-regalloc and then doing the shrinking
2664  // post-regalloc.
2665  if (Src2) {
2666  switch (MI.getOpcode()) {
2667  default: return false;
2668 
2669  case AMDGPU::V_ADDC_U32_e64:
2670  case AMDGPU::V_SUBB_U32_e64:
2671  case AMDGPU::V_SUBBREV_U32_e64: {
2672  const MachineOperand *Src1
2673  = getNamedOperand(MI, AMDGPU::OpName::src1);
2674  if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
2675  return false;
2676  // Additional verification is needed for sdst/src2.
2677  return true;
2678  }
2679  case AMDGPU::V_MAC_F32_e64:
2680  case AMDGPU::V_MAC_F16_e64:
2681  case AMDGPU::V_FMAC_F32_e64:
2682  if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
2683  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
2684  return false;
2685  break;
2686 
2687  case AMDGPU::V_CNDMASK_B32_e64:
2688  break;
2689  }
2690  }
2691 
2692  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2693  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
2694  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
2695  return false;
2696 
 2697  // We don't need to check src0; all input types are legal, so just make sure
2698  // src0 isn't using any modifiers.
2699  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
2700  return false;
2701 
2702  // Can it be shrunk to a valid 32 bit opcode?
2703  if (!hasVALU32BitEncoding(MI.getOpcode()))
2704  return false;
2705 
2706  // Check output modifiers
2707  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
2708  !hasModifiersSet(MI, AMDGPU::OpName::clamp);
2709 }
2710 
2711 // Set VCC operand with all flags from \p Orig, except for setting it as
2712 // implicit.
2713 static void copyFlagsToImplicitVCC(MachineInstr &MI,
 2714  const MachineOperand &Orig) {
2715 
2716  for (MachineOperand &Use : MI.implicit_operands()) {
2717  if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
2718  Use.setIsUndef(Orig.isUndef());
2719  Use.setIsKill(Orig.isKill());
2720  return;
2721  }
2722  }
2723 }
2724 
2726  unsigned Op32) const {
 2727  MachineBasicBlock *MBB = MI.getParent();
2728  MachineInstrBuilder Inst32 =
2729  BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
2730 
2731  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
2732  // For VOPC instructions, this is replaced by an implicit def of vcc.
2733  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
2734  if (Op32DstIdx != -1) {
2735  // dst
2736  Inst32.add(MI.getOperand(0));
2737  } else {
2738  assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
2739  "Unexpected case");
2740  }
2741 
2742  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
2743 
2744  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2745  if (Src1)
2746  Inst32.add(*Src1);
2747 
2748  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2749 
2750  if (Src2) {
2751  int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
2752  if (Op32Src2Idx != -1) {
2753  Inst32.add(*Src2);
2754  } else {
2755  // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
2756  // replaced with an implicit read of vcc. This was already added
2757  // during the initial BuildMI, so find it to preserve the flags.
2758  copyFlagsToImplicitVCC(*Inst32, *Src2);
2759  }
2760  }
2761 
2762  return Inst32;
2763 }
2764 
2765 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
 2766  const MachineOperand &MO,
2767  const MCOperandInfo &OpInfo) const {
2768  // Literal constants use the constant bus.
2769  //if (isLiteralConstantLike(MO, OpInfo))
2770  // return true;
2771  if (MO.isImm())
2772  return !isInlineConstant(MO, OpInfo);
2773 
2774  if (!MO.isReg())
2775  return true; // Misc other operands like FrameIndex
2776 
2777  if (!MO.isUse())
2778  return false;
2779 
2780  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
 2781  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2782 
2783  // FLAT_SCR is just an SGPR pair.
2784  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2785  return true;
2786 
2787  // EXEC register uses the constant bus.
2788  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2789  return true;
2790 
2791  // SGPRs use the constant bus
2792  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2793  (!MO.isImplicit() &&
2794  (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2795  AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2796 }
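// Summary sketch of what counts against the constant bus (derived from the
// checks above): literal (non-inline) immediates, SGPRs, VCC, M0, FLAT_SCR
// and explicit uses of EXEC all use it; VGPRs and inline constants do not.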
2797 
2798 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2799  for (const MachineOperand &MO : MI.implicit_operands()) {
2800  // We only care about reads.
2801  if (MO.isDef())
2802  continue;
2803 
2804  switch (MO.getReg()) {
2805  case AMDGPU::VCC:
2806  case AMDGPU::M0:
2807  case AMDGPU::FLAT_SCR:
2808  return MO.getReg();
2809 
2810  default:
2811  break;
2812  }
2813  }
2814 
2815  return AMDGPU::NoRegister;
2816 }
2817 
2818 static bool shouldReadExec(const MachineInstr &MI) {
2819  if (SIInstrInfo::isVALU(MI)) {
2820  switch (MI.getOpcode()) {
2821  case AMDGPU::V_READLANE_B32:
2822  case AMDGPU::V_READLANE_B32_si:
2823  case AMDGPU::V_READLANE_B32_vi:
2824  case AMDGPU::V_WRITELANE_B32:
2825  case AMDGPU::V_WRITELANE_B32_si:
2826  case AMDGPU::V_WRITELANE_B32_vi:
2827  return false;
2828  }
2829 
2830  return true;
2831  }
2832 
2833  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2834  SIInstrInfo::isSALU(MI) ||
2835  SIInstrInfo::isSMRD(MI))
2836  return false;
2837 
2838  return true;
2839 }
2840 
2841 static bool isSubRegOf(const SIRegisterInfo &TRI,
2842  const MachineOperand &SuperVec,
2843  const MachineOperand &SubReg) {
2844  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
 2845  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2846 
2847  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2848  SubReg.getReg() == SuperVec.getReg();
2849 }
2850 
2851 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
 2852  StringRef &ErrInfo) const {
2853  uint16_t Opcode = MI.getOpcode();
2854  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2855  return true;
2856 
2857  const MachineFunction *MF = MI.getParent()->getParent();
2858  const MachineRegisterInfo &MRI = MF->getRegInfo();
2859 
2860  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2861  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2862  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2863 
2864  // Make sure the number of operands is correct.
2865  const MCInstrDesc &Desc = get(Opcode);
2866  if (!Desc.isVariadic() &&
2867  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2868  ErrInfo = "Instruction has wrong number of operands.";
2869  return false;
2870  }
2871 
2872  if (MI.isInlineAsm()) {
2873  // Verify register classes for inlineasm constraints.
2874  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2875  I != E; ++I) {
2876  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2877  if (!RC)
2878  continue;
2879 
2880  const MachineOperand &Op = MI.getOperand(I);
2881  if (!Op.isReg())
2882  continue;
2883 
2884  unsigned Reg = Op.getReg();
2885  if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2886  ErrInfo = "inlineasm operand has incorrect register class.";
2887  return false;
2888  }
2889  }
2890 
2891  return true;
2892  }
2893 
2894  // Make sure the register classes are correct.
2895  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2896  if (MI.getOperand(i).isFPImm()) {
2897  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2898  "all fp values to integers.";
2899  return false;
2900  }
2901 
2902  int RegClass = Desc.OpInfo[i].RegClass;
2903 
2904  switch (Desc.OpInfo[i].OperandType) {
2906  if (MI.getOperand(i).isImm()) {
2907  ErrInfo = "Illegal immediate value for operand.";
2908  return false;
2909  }
2910  break;
2913  break;
2920  const MachineOperand &MO = MI.getOperand(i);
2921  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2922  ErrInfo = "Illegal immediate value for operand.";
2923  return false;
2924  }
2925  break;
2926  }
2929  // Check if this operand is an immediate.
2930  // FrameIndex operands will be replaced by immediates, so they are
2931  // allowed.
2932  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2933  ErrInfo = "Expected immediate, but got non-immediate";
2934  return false;
2935  }
2937  default:
2938  continue;
2939  }
2940 
2941  if (!MI.getOperand(i).isReg())
2942  continue;
2943 
2944  if (RegClass != -1) {
2945  unsigned Reg = MI.getOperand(i).getReg();
2946  if (Reg == AMDGPU::NoRegister ||
2947  TargetRegisterInfo::isVirtualRegister(Reg))
 2948  continue;
2949 
2950  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2951  if (!RC->contains(Reg)) {
2952  ErrInfo = "Operand has incorrect register class.";
2953  return false;
2954  }
2955  }
2956  }
2957 
2958  // Verify SDWA
2959  if (isSDWA(MI)) {
2960  if (!ST.hasSDWA()) {
2961  ErrInfo = "SDWA is not supported on this target";
2962  return false;
2963  }
2964 
2965  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2966 
 2967  const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
 2968 
 2969  for (int OpIdx : OpIndices) {
2970  if (OpIdx == -1)
2971  continue;
2972  const MachineOperand &MO = MI.getOperand(OpIdx);
2973 
2974  if (!ST.hasSDWAScalar()) {
 2975  // Only VGPRs on VI
2976  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2977  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2978  return false;
2979  }
2980  } else {
2981  // No immediates on GFX9
2982  if (!MO.isReg()) {
2983  ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2984  return false;
2985  }
2986  }
2987  }
2988 
2989  if (!ST.hasSDWAOmod()) {
2990  // No omod allowed on VI
2991  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2992  if (OMod != nullptr &&
2993  (!OMod->isImm() || OMod->getImm() != 0)) {
2994  ErrInfo = "OMod not allowed in SDWA instructions on VI";
2995  return false;
2996  }
2997  }
2998 
2999  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
3000  if (isVOPC(BasicOpcode)) {
3001  if (!ST.hasSDWASdst() && DstIdx != -1) {
3002  // Only vcc allowed as dst on VI for VOPC
3003  const MachineOperand &Dst = MI.getOperand(DstIdx);
3004  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
3005  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
3006  return false;
3007  }
3008  } else if (!ST.hasSDWAOutModsVOPC()) {
3009  // No clamp allowed on GFX9 for VOPC
3010  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3011  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
3012  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
3013  return false;
3014  }
3015 
3016  // No omod allowed on GFX9 for VOPC
3017  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3018  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
3019  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
3020  return false;
3021  }
3022  }
3023  }
3024 
3025  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
3026  if (DstUnused && DstUnused->isImm() &&
3027  DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
3028  const MachineOperand &Dst = MI.getOperand(DstIdx);
3029  if (!Dst.isReg() || !Dst.isTied()) {
3030  ErrInfo = "Dst register should have tied register";
3031  return false;
3032  }
3033 
3034  const MachineOperand &TiedMO =
3035  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
3036  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
3037  ErrInfo =
3038  "Dst register should be tied to implicit use of preserved register";
3039  return false;
3040  } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
3041  Dst.getReg() != TiedMO.getReg()) {
3042  ErrInfo = "Dst register should use same physical register as preserved";
3043  return false;
3044  }
3045  }
3046  }
3047 
3048  // Verify MIMG
3049  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
3050  // Ensure that the return type used is large enough for all the options
 3051  // being used. TFE/LWE require an extra result register.
3052  const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
3053  if (DMask) {
3054  uint64_t DMaskImm = DMask->getImm();
3055  uint32_t RegCount =
3056  isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
3057  const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
3058  const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
3059  const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
3060 
3061  // Adjust for packed 16 bit values
3062  if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3063  RegCount >>= 1;
3064 
3065  // Adjust if using LWE or TFE
3066  if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3067  RegCount += 1;
3068 
3069  const uint32_t DstIdx =
3070  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3071  const MachineOperand &Dst = MI.getOperand(DstIdx);
3072  if (Dst.isReg()) {
3073  const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3074  uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3075  if (RegCount > DstSize) {
3076  ErrInfo = "MIMG instruction returns too many registers for dst "
3077  "register class";
3078  return false;
3079  }
3080  }
3081  }
3082  }
3083 
3084  // Verify VOP*. Ignore multiple sgpr operands on writelane.
3085  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
3086  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
3087  // Only look at the true operands. Only a real operand can use the constant
3088  // bus, and we don't want to check pseudo-operands like the source modifier
3089  // flags.
3090  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
3091 
3092  unsigned ConstantBusCount = 0;
3093  unsigned LiteralCount = 0;
3094 
3095  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
3096  ++ConstantBusCount;
3097 
3098  unsigned SGPRUsed = findImplicitSGPRRead(MI);
3099  if (SGPRUsed != AMDGPU::NoRegister)
3100  ++ConstantBusCount;
3101 
3102  for (int OpIdx : OpIndices) {
3103  if (OpIdx == -1)
3104  break;
3105  const MachineOperand &MO = MI.getOperand(OpIdx);
3106  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3107  if (MO.isReg()) {
3108  if (MO.getReg() != SGPRUsed)
3109  ++ConstantBusCount;
3110  SGPRUsed = MO.getReg();
3111  } else {
3112  ++ConstantBusCount;
3113  ++LiteralCount;
3114  }
3115  }
3116  }
3117  if (ConstantBusCount > 1) {
3118  ErrInfo = "VOP* instruction uses the constant bus more than once";
3119  return false;
3120  }
3121 
3122  if (isVOP3(MI) && LiteralCount) {
3123  ErrInfo = "VOP3 instruction uses literal";
3124  return false;
3125  }
3126  }
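  // Illustrative rejection (hypothetical operands): "V_ADD_F32_e32 %v0, %s0, %s1"
  // would count two distinct SGPR reads on the constant bus and fail the check
  // above, while "V_ADD_F32_e32 %v0, %s0, %v1" uses the bus only once and passes.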
3127 
3128  // Verify misc. restrictions on specific instructions.
3129  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
3130  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
3131  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3132  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
3133  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
3134  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
3135  if (!compareMachineOp(Src0, Src1) &&
3136  !compareMachineOp(Src0, Src2)) {
3137  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
3138  return false;
3139  }
3140  }
3141  }
3142 
3143  if (isSOPK(MI)) {
3144  int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
3145  if (sopkIsZext(MI)) {
3146  if (!isUInt<16>(Imm)) {
3147  ErrInfo = "invalid immediate for SOPK instruction";
3148  return false;
3149  }
3150  } else {
3151  if (!isInt<16>(Imm)) {
3152  ErrInfo = "invalid immediate for SOPK instruction";
3153  return false;
3154  }
3155  }
3156  }
3157 
3158  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
3159  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
3160  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3161  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
3162  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3163  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
3164 
3165  const unsigned StaticNumOps = Desc.getNumOperands() +
3166  Desc.getNumImplicitUses();
3167  const unsigned NumImplicitOps = IsDst ? 2 : 1;
3168 
3169  // Allow additional implicit operands. This allows a fixup done by the post
3170  // RA scheduler where the main implicit operand is killed and implicit-defs
3171  // are added for sub-registers that remain live after this instruction.
3172  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
3173  ErrInfo = "missing implicit register operands";
3174  return false;
3175  }
3176 
3177  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3178  if (IsDst) {
3179  if (!Dst->isUse()) {
3180  ErrInfo = "v_movreld_b32 vdst should be a use operand";
3181  return false;
3182  }
3183 
3184  unsigned UseOpIdx;
3185  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
3186  UseOpIdx != StaticNumOps + 1) {
3187  ErrInfo = "movrel implicit operands should be tied";
3188  return false;
3189  }
3190  }
3191 
3192  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3193  const MachineOperand &ImpUse
3194  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
3195  if (!ImpUse.isReg() || !ImpUse.isUse() ||
3196  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
3197  ErrInfo = "src0 should be subreg of implicit vector use";
3198  return false;
3199  }
3200  }
3201 
3202  // Make sure we aren't losing exec uses in the td files. This mostly requires
3203  // being careful when using let Uses to try to add other use registers.
3204  if (shouldReadExec(MI)) {
3205  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
3206  ErrInfo = "VALU instruction does not implicitly read exec mask";
3207  return false;
3208  }
3209  }
3210 
3211  if (isSMRD(MI)) {
3212  if (MI.mayStore()) {
3213  // The register offset form of scalar stores may only use m0 as the
3214  // soffset register.
3215  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
3216  if (Soff && Soff->getReg() != AMDGPU::M0) {
3217  ErrInfo = "scalar stores must use m0 as offset register";
3218  return false;
3219  }
3220  }
3221  }
3222 
3223  if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
3224  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3225  if (Offset->getImm() != 0) {
3226  ErrInfo = "subtarget does not support offsets in flat instructions";
3227  return false;
3228  }
3229  }
3230 
3231  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
3232  if (DppCt) {
3233  using namespace AMDGPU::DPP;
3234 
3235  unsigned DC = DppCt->getImm();
3236  if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
3237  DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
3242  ErrInfo = "Invalid dpp_ctrl value";
3243  return false;
3244  }
3245  }
3246 
3247  return true;
3248 }
3249 
3250 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
3251  switch (MI.getOpcode()) {
3252  default: return AMDGPU::INSTRUCTION_LIST_END;
3253  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
3254  case AMDGPU::COPY: return AMDGPU::COPY;
3255  case AMDGPU::PHI: return AMDGPU::PHI;
3256  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
3257  case AMDGPU::WQM: return AMDGPU::WQM;
3258  case AMDGPU::WWM: return AMDGPU::WWM;
3259  case AMDGPU::S_MOV_B32:
3260  return MI.getOperand(1).isReg() ?
3261  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
3262  case AMDGPU::S_ADD_I32:
3263  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
3264  case AMDGPU::S_ADDC_U32:
3265  return AMDGPU::V_ADDC_U32_e32;
3266  case AMDGPU::S_SUB_I32:
3267  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
3268  // FIXME: These are not consistently handled, and selected when the carry is
3269  // used.
3270  case AMDGPU::S_ADD_U32:
3271  return AMDGPU::V_ADD_I32_e32;
3272  case AMDGPU::S_SUB_U32:
3273  return AMDGPU::V_SUB_I32_e32;
3274  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3275  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
3276  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
3277  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
3278  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3279  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3280  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3281  case AMDGPU::S_XNOR_B32:
3282  return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
3283  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3284  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3285  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3286  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3287  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3288  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3289  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3290  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3291  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3292  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3293  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3294  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3295  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3296  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3297  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3298  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3299  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3300  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3301  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3302  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3303  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3304  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3305  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3306  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3307  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3308  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3309  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3310  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3311  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3312  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3313  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3314  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3315  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3316  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3317  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3318  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3319  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3320  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3321  }
3323  "Unexpected scalar opcode without corresponding vector one!");
3324 }
3325 
3326 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
 3327  unsigned OpNo) const {
3328  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
 3329  const MCInstrDesc &Desc = get(MI.getOpcode());
 3330  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
 3331  Desc.OpInfo[OpNo].RegClass == -1) {
 3332  unsigned Reg = MI.getOperand(OpNo).getReg();
 3333 
3334  if (TargetRegisterInfo::isVirtualRegister(Reg))
 3335  return MRI.getRegClass(Reg);
3336  return RI.getPhysRegClass(Reg);
3337  }
3338 
3339  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3340  return RI.getRegClass(RCID);
3341 }
3342 
 3343 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3344  MachineBasicBlock::iterator I = MI.getIterator();
 3345  MachineBasicBlock *MBB = MI.getParent();
 3346  MachineOperand &MO = MI.getOperand(OpIdx);
3347  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 3348  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3349  const TargetRegisterClass *RC = RI.getRegClass(RCID);
3350  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3351  if (MO.isReg())
3352  Opcode = AMDGPU::COPY;
3353  else if (RI.isSGPRClass(RC))
3354  Opcode = AMDGPU::S_MOV_B32;
3355 
3356  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3357  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3358  VRC = &AMDGPU::VReg_64RegClass;
3359  else
3360  VRC = &AMDGPU::VGPR_32RegClass;
3361 
3362  unsigned Reg = MRI.createVirtualRegister(VRC);
3363  DebugLoc DL = MBB->findDebugLoc(I);
3364  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3365  MO.ChangeToRegister(Reg, false);
3366 }
3367 
3368 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3369  MachineRegisterInfo &MRI,
 3370  MachineOperand &SuperReg,
3371  const TargetRegisterClass *SuperRC,
3372  unsigned SubIdx,
3373  const TargetRegisterClass *SubRC)
3374  const {
3375  MachineBasicBlock *MBB = MI->getParent();
3376  DebugLoc DL = MI->getDebugLoc();
3377  unsigned SubReg = MRI.createVirtualRegister(SubRC);
3378 
3379  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3380  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3381  .addReg(SuperReg.getReg(), 0, SubIdx);
3382  return SubReg;
3383  }
3384 
3385  // Just in case the super register is itself a sub-register, copy it to a new
3386  // value so we don't need to worry about merging its subreg index with the
3387  // SubIdx passed to this function. The register coalescer should be able to
3388  // eliminate this extra copy.
3389  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3390 
3391  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3392  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3393 
3394  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3395  .addReg(NewSuperReg, 0, SubIdx);
3396 
3397  return SubReg;
3398 }
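// Usage sketch (illustrative): extracting sub1 from a 64-bit super register
// produces a plain subregister COPY into a fresh virtual register, e.g.
//
//   %hi:sgpr_32 = COPY %pair.sub1
//
// and the helper just below wraps the result in a new register operand.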
3399 
3400 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3401  MachineBasicBlock::iterator MII,
3402  MachineRegisterInfo &MRI,
 3403  MachineOperand &Op,
3404  const TargetRegisterClass *SuperRC,
3405  unsigned SubIdx,
3406  const TargetRegisterClass *SubRC) const {
3407  if (Op.isImm()) {
3408  if (SubIdx == AMDGPU::sub0)
3409  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3410  if (SubIdx == AMDGPU::sub1)
3411  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3412 
3413  llvm_unreachable("Unhandled register index for immediate");
3414  }
3415 
3416  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3417  SubIdx, SubRC);
3418  return MachineOperand::CreateReg(SubReg, false);
3419 }
3420 
3421 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3422 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3423  assert(Inst.getNumExplicitOperands() == 3);
3424  MachineOperand Op1 = Inst.getOperand(1);
3425  Inst.RemoveOperand(1);
3426  Inst.addOperand(Op1);
3427 }
3428 
3429 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
 3430  const MCOperandInfo &OpInfo,
3431  const MachineOperand &MO) const {
3432  if (!MO.isReg())
3433  return false;
3434 
3435  unsigned Reg = MO.getReg();
 3436  const TargetRegisterClass *RC =
3437  TargetRegisterInfo::isVirtualRegister(Reg) ?
 3438  MRI.getRegClass(Reg) :
3439  RI.getPhysRegClass(Reg);
3440 
3441  const SIRegisterInfo *TRI =
3442  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3443  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3444 
3445  // In order to be legal, the common sub-class must be equal to the
3446  // class of the current operand. For example:
3447  //
3448  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3449  // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3450  //
3451  // s_sendmsg 0, s0 ; Operand defined as m0reg
3452  // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3453 
3454  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3455 }
3456 
3458  const MCOperandInfo &OpInfo,
3459  const MachineOperand &MO) const {
3460  if (MO.isReg())
3461  return isLegalRegOperand(MRI, OpInfo, MO);
3462 
3463  // Handle non-register types that are treated like immediates.
3464  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3465  return true;
3466 }
3467 
3468 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3469  const MachineOperand *MO) const {
3470  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
 3471  const MCInstrDesc &InstDesc = MI.getDesc();
3472  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3473  const TargetRegisterClass *DefinedRC =
3474  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3475  if (!MO)
3476  MO = &MI.getOperand(OpIdx);
3477 
3478  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3479 
3480  RegSubRegPair SGPRUsed;
3481  if (MO->isReg())
3482  SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3483 
3484  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3485  if (i == OpIdx)
3486  continue;
3487  const MachineOperand &Op = MI.getOperand(i);
3488  if (Op.isReg()) {
3489  if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3490  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3491  return false;
3492  }
3493  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3494  return false;
3495  }
3496  }
3497  }
3498 
3499  if (MO->isReg()) {
3500  assert(DefinedRC);
3501  return isLegalRegOperand(MRI, OpInfo, *MO);
3502  }
3503 
3504  // Handle non-register types that are treated like immediates.
3505  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3506 
3507  if (!DefinedRC) {
3508  // This operand expects an immediate.
3509  return true;
3510  }
3511 
3512  return isImmOperandLegal(MI, OpIdx, *MO);
3513 }
3514 
3515 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
 3516  MachineInstr &MI) const {
3517  unsigned Opc = MI.getOpcode();
3518  const MCInstrDesc &InstrDesc = get(Opc);
3519 
3520  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3521  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3522 
3523  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3524  // we need to only have one constant bus use.
3525  //
3526  // Note we do not need to worry about literal constants here. They are
3527  // disabled for the operand type for instructions because they will always
3528  // violate the one constant bus use rule.
3529  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3530  if (HasImplicitSGPR) {
3531  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3532  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3533 
3534  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3535  legalizeOpWithMove(MI, Src0Idx);
3536  }
3537 
3538  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3539  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3540  // src0/src1 with V_READFIRSTLANE.
3541  if (Opc == AMDGPU::V_WRITELANE_B32) {
3542  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3543  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3544  const DebugLoc &DL = MI.getDebugLoc();
3545  if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3546  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3547  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3548  .add(Src0);
3549  Src0.ChangeToRegister(Reg, false);
3550  }
3551  if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3552  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3553  const DebugLoc &DL = MI.getDebugLoc();
3554  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3555  .add(Src1);
3556  Src1.ChangeToRegister(Reg, false);
3557  }
3558  return;
3559  }
3560 
3561  // VOP2 instructions accept all operand types in src0, so we don't need to
3562  // check its legality. If src1 is already legal, we don't need to do anything.
3563  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3564  return;
3565 
3566  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3567  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3568  // select is uniform.
3569  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3570  RI.isVGPR(MRI, Src1.getReg())) {
3571  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3572  const DebugLoc &DL = MI.getDebugLoc();
3573  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3574  .add(Src1);
3575  Src1.ChangeToRegister(Reg, false);
3576  return;
3577  }
3578 
3579  // We do not use commuteInstruction here because it is too aggressive and will
3580  // commute if it is possible. We only want to commute here if it improves
3581  // legality. This can be called a fairly large number of times so don't waste
3582  // compile time pointlessly swapping and checking legality again.
3583  if (HasImplicitSGPR || !MI.isCommutable()) {
3584  legalizeOpWithMove(MI, Src1Idx);
3585  return;
3586  }
3587 
3588  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3589  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3590 
3591  // If src0 can be used as src1, commuting will make the operands legal.
3592  // Otherwise we have to give up and insert a move.
3593  //
3594  // TODO: Other immediate-like operand kinds could be commuted if there was a
3595  // MachineOperand::ChangeTo* for them.
3596  if ((!Src1.isImm() && !Src1.isReg()) ||
3597  !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3598  legalizeOpWithMove(MI, Src1Idx);
3599  return;
3600  }
3601 
3602  int CommutedOpc = commuteOpcode(MI);
3603  if (CommutedOpc == -1) {
3604  legalizeOpWithMove(MI, Src1Idx);
3605  return;
3606  }
3607 
3608  MI.setDesc(get(CommutedOpc));
3609 
3610  unsigned Src0Reg = Src0.getReg();
3611  unsigned Src0SubReg = Src0.getSubReg();
3612  bool Src0Kill = Src0.isKill();
3613 
3614  if (Src1.isImm())
3615  Src0.ChangeToImmediate(Src1.getImm());
3616  else if (Src1.isReg()) {
3617  Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3618  Src0.setSubReg(Src1.getSubReg());
3619  } else
3620  llvm_unreachable("Should only have register or immediate operands");
3621 
3622  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3623  Src1.setSubReg(Src0SubReg);
3624 }
3625 
3626 // Legalize VOP3 operands. Because all operand types are supported for any
3627 // operand, and since literal constants are not allowed and should never be
3628 // seen, we only need to worry about inserting copies if we use multiple SGPR
3629 // operands.
3630 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3631  MachineInstr &MI) const {
3632  unsigned Opc = MI.getOpcode();
3633 
3634  int VOP3Idx[3] = {
3635  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3636  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3637  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3638  };
3639 
3640  // Find the one SGPR operand we are allowed to use.
3641  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3642 
3643  for (unsigned i = 0; i < 3; ++i) {
3644  int Idx = VOP3Idx[i];
3645  if (Idx == -1)
3646  break;
3647  MachineOperand &MO = MI.getOperand(Idx);
3648 
3649  // We should never see a VOP3 instruction with an illegal immediate operand.
3650  if (!MO.isReg())
3651  continue;
3652 
3653  if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3654  continue; // VGPRs are legal
3655 
3656  if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3657  SGPRReg = MO.getReg();
3658  // We can use one SGPR in each VOP3 instruction.
3659  continue;
3660  }
3661 
3662  // If we make it this far, then the operand is not legal and we must
3663  // legalize it.
3664  legalizeOpWithMove(MI, Idx);
3665  }
3666 }
3667 
3668 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3669  MachineRegisterInfo &MRI) const {
3670  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3671  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3672  unsigned DstReg = MRI.createVirtualRegister(SRC);
3673  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3674 
3675  if (SubRegs == 1) {
3676  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3677  get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3678  .addReg(SrcReg);
3679  return DstReg;
3680  }
3681 
3683  for (unsigned i = 0; i < SubRegs; ++i) {
3684  unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3685  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3686  get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3687  .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3688  SRegs.push_back(SGPR);
3689  }
3690 
3691  MachineInstrBuilder MIB =
3692  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3693  get(AMDGPU::REG_SEQUENCE), DstReg);
3694  for (unsigned i = 0; i < SubRegs; ++i) {
3695  MIB.addReg(SRegs[i]);
3696  MIB.addImm(RI.getSubRegFromChannel(i));
3697  }
3698  return DstReg;
3699 }
3700 
3701 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3702  MachineInstr &MI) const {
3703 
3704  // If the pointer is stored in VGPRs, then we need to move it to
3705  // SGPRs using v_readfirstlane. This is safe because we only select
3706  // loads with uniform pointers to SMRD instructions, so we know the
3707  // pointer value is uniform.
3708  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3709  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3710  unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3711  SBase->setReg(SGPR);
3712  }
3713  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3714  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3715  unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3716  SOff->setReg(SGPR);
3717  }
3718 }
3719 
3720 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3721  MachineBasicBlock::iterator I,
3722  const TargetRegisterClass *DstRC,
3723  MachineOperand &Op,
3724  MachineRegisterInfo &MRI,
3725  const DebugLoc &DL) const {
3726  unsigned OpReg = Op.getReg();
3727  unsigned OpSubReg = Op.getSubReg();
3728 
3729  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3730  RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3731 
3732  // Check if operand is already the correct register class.
3733  if (DstRC == OpRC)
3734  return;
3735 
3736  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3737  MachineInstr *Copy =
3738  BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3739 
3740  Op.setReg(DstReg);
3741  Op.setSubReg(0);
3742 
3743  MachineInstr *Def = MRI.getVRegDef(OpReg);
3744  if (!Def)
3745  return;
3746 
3747  // Try to eliminate the copy if it is copying an immediate value.
3748  if (Def->isMoveImmediate())
3749  FoldImmediate(*Copy, *Def, OpReg, &MRI);
3750 }
3751 
3752 // Emit the actual waterfall loop, executing the wrapped instruction for each
3753 // unique value of \p Rsrc across all lanes. In the best case we execute 1
3754 // iteration, in the worst case we execute 64 (once per lane).
3755 static void
3756 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
3757  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3758  const DebugLoc &DL, MachineOperand &Rsrc) {
3759  MachineBasicBlock::iterator I = LoopBB.begin();
3760 
3761  unsigned VRsrc = Rsrc.getReg();
3762  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
3763 
3764  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3765  unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3766  unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3767  unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3768  unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3769  unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3770  unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3771  unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3772  unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3773 
3774  // Beginning of the loop, read the next Rsrc variant.
3775  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
3776  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
3777  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
3778  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
3779  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
3780  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
3781  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
3782  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
3783 
3784  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
3785  .addReg(SRsrcSub0)
3786  .addImm(AMDGPU::sub0)
3787  .addReg(SRsrcSub1)
3788  .addImm(AMDGPU::sub1)
3789  .addReg(SRsrcSub2)
3790  .addImm(AMDGPU::sub2)
3791  .addReg(SRsrcSub3)
3792  .addImm(AMDGPU::sub3);
3793 
3794  // Update Rsrc operand to use the SGPR Rsrc.
3795  Rsrc.setReg(SRsrc);
3796  Rsrc.setIsKill(true);
3797 
3798  // Identify all lanes with identical Rsrc operands in their VGPRs.
3799  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
3800  .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
3801  .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
3802  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
3803  .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
3804  .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
3805  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
3806  .addReg(CondReg0)
3807  .addReg(CondReg1);
3808 
3809  MRI.setSimpleHint(SaveExec, AndCond);
3810 
3811  // Update EXEC to matching lanes, saving original to SaveExec.
3812  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
3813  .addReg(AndCond, RegState::Kill);
3814 
3815  // The original instruction is here; we insert the terminators after it.
3816  I = LoopBB.end();
3817 
3818  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3819  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
3820  .addReg(AMDGPU::EXEC)
3821  .addReg(SaveExec);
3822  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
3823 }
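// Standalone sketch (hypothetical helper, not part of SIInstrInfo) of why the
// loop emitted above runs between 1 and 64 times: each iteration retires every
// lane whose descriptor matches the value read from the first still-active
// lane. A plain uint64_t stands in for the 128-bit resource descriptor.
static inline unsigned countWaterfallIterationsExample(const uint64_t Rsrc[64]) {
  uint64_t Exec = ~uint64_t(0); // All 64 lanes start active.
  unsigned Iterations = 0;
  while (Exec) {
    unsigned FirstLane = 0;
    while (!(Exec & (uint64_t(1) << FirstLane)))
      ++FirstLane;                        // v_readfirstlane picks this lane.
    uint64_t Current = Rsrc[FirstLane];
    for (unsigned L = 0; L != 64; ++L)
      if ((Exec & (uint64_t(1) << L)) && Rsrc[L] == Current)
        Exec &= ~(uint64_t(1) << L);      // These lanes run this iteration.
    ++Iterations;
  }
  return Iterations; // 1 if all lanes agree, 64 if every lane differs.
}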
3824 
3825 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
3826 // with SGPRs by iterating over all unique values across all lanes.
3827 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
3828  MachineOperand &Rsrc, MachineDominatorTree *MDT) {
3829  MachineBasicBlock &MBB = *MI.getParent();
3830  MachineFunction &MF = *MBB.getParent();
3833  const DebugLoc &DL = MI.getDebugLoc();
3834 
3835  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3836 
3837  // Save the EXEC mask
3838  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
3839  .addReg(AMDGPU::EXEC);
3840 
3841  // Killed uses in the instruction we are waterfalling around will be
3842  // incorrect due to the added control-flow.
3843  for (auto &MO : MI.uses()) {
3844  if (MO.isReg() && MO.isUse()) {
3845  MRI.clearKillFlags(MO.getReg());
3846  }
3847  }
3848 
3849  // To insert the loop we need to split the block. Move everything after this
3850  // point to a new block, and insert a new empty block between the two.
3851  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
3852  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
3853  MachineFunction::iterator MBBI(MBB);
3854  ++MBBI;
3855 
3856  MF.insert(MBBI, LoopBB);
3857  MF.insert(MBBI, RemainderBB);
3858 
3859  LoopBB->addSuccessor(LoopBB);
3860  LoopBB->addSuccessor(RemainderBB);
3861 
3862  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
3864  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3865  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3866  LoopBB->splice(LoopBB->begin(), &MBB, J);
3867 
3868  MBB.addSuccessor(LoopBB);
3869 
3870  // Update dominators. We know that MBB immediately dominates LoopBB, that
3871  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
3872  // dominates all of the successors transferred to it from MBB that MBB used
3873  // to dominate.
3874  if (MDT) {
3875  MDT->addNewBlock(LoopBB, &MBB);
3876  MDT->addNewBlock(RemainderBB, LoopBB);
3877  for (auto &Succ : RemainderBB->successors()) {
3878  if (MDT->dominates(&MBB, Succ)) {
3879  MDT->changeImmediateDominator(Succ, RemainderBB);
3880  }
3881  }
3882  }
3883 
3884  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
3885 
3886  // Restore the EXEC mask
3887  MachineBasicBlock::iterator First = RemainderBB->begin();
3888  BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3889  .addReg(SaveExec);
3890 }
3891 
3892 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
3893 static std::tuple<unsigned, unsigned>
3894 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
3895  MachineBasicBlock &MBB = *MI.getParent();
3896  MachineFunction &MF = *MBB.getParent();
3898 
3899  // Extract the ptr from the resource descriptor.
3900  unsigned RsrcPtr =
3901  TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
3902  AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3903 
3904  // Create an empty resource descriptor
3905  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3906  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3907  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3908  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3909  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
3910 
3911  // Zero64 = 0
3912  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
3913  .addImm(0);
3914 
3915  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3916  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3917  .addImm(RsrcDataFormat & 0xFFFFFFFF);
3918 
3919  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3920  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3921  .addImm(RsrcDataFormat >> 32);
3922 
3923  // NewSRsrc = {Zero64, SRsrcFormat}
3924  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3925  .addReg(Zero64)
3926  .addImm(AMDGPU::sub0_sub1)
3927  .addReg(SRsrcFormatLo)
3928  .addImm(AMDGPU::sub2)
3929  .addReg(SRsrcFormatHi)
3930  .addImm(AMDGPU::sub3);
3931 
3932  return std::make_tuple(RsrcPtr, NewSRsrc);
3933 }
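// Minimal sketch (illustrative only; the helper name is made up) of the 64-bit
// split performed above: the default resource data format is written as two
// 32-bit halves before being glued to the zeroed base pointer by REG_SEQUENCE.
static inline std::pair<uint32_t, uint32_t> splitRsrcFormatExample(uint64_t Fmt) {
  uint32_t Lo = static_cast<uint32_t>(Fmt & 0xFFFFFFFF); // RSRC_DATA_FORMAT{31-0}
  uint32_t Hi = static_cast<uint32_t>(Fmt >> 32);        // RSRC_DATA_FORMAT{63-32}
  return std::make_pair(Lo, Hi);
}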
3934 
3935 void SIInstrInfo::legalizeOperands(MachineInstr &MI,
3936  MachineDominatorTree *MDT) const {
3937  MachineFunction &MF = *MI.getParent()->getParent();
3939 
3940  // Legalize VOP2
3941  if (isVOP2(MI) || isVOPC(MI)) {
3942  legalizeOperandsVOP2(MRI, MI);
3943  return;
3944  }
3945 
3946  // Legalize VOP3
3947  if (isVOP3(MI)) {
3948  legalizeOperandsVOP3(MRI, MI);
3949  return;
3950  }
3951 
3952  // Legalize SMRD
3953  if (isSMRD(MI)) {
3954  legalizeOperandsSMRD(MRI, MI);
3955  return;
3956  }
3957 
3958  // Legalize REG_SEQUENCE and PHI
3959  // The register class of the operands must be the same type as the register
3960  // class of the output.
3961  if (MI.getOpcode() == AMDGPU::PHI) {
3962  const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3963  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3964  if (!MI.getOperand(i).isReg() ||
3966  continue;
3967  const TargetRegisterClass *OpRC =
3968  MRI.getRegClass(MI.getOperand(i).getReg());
3969  if (RI.hasVGPRs(OpRC)) {
3970  VRC = OpRC;
3971  } else {
3972  SRC = OpRC;
3973  }
3974  }
3975 
3976  // If any of the operands are VGPR registers, then they all must be VGPRs;
3977  // otherwise we will create illegal VGPR->SGPR copies when legalizing
3978  // them.
3979  if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3980  if (!VRC) {
3981  assert(SRC);
3982  VRC = RI.getEquivalentVGPRClass(SRC);
3983  }
3984  RC = VRC;
3985  } else {
3986  RC = SRC;
3987  }
3988 
3989  // Update all the operands so they have the same type.
3990  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3991  MachineOperand &Op = MI.getOperand(I);
3993  continue;
3994 
3995  // MI is a PHI instruction.
3996  MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3997  MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3998 
3999  // Avoid creating no-op copies with the same src and dst reg class. These
4000  // confuse some of the machine passes.
4001  legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
4002  }
4003  }
4004 
4005  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
4006  // VGPR dest type and SGPR sources, insert copies so all operands are
4007  // VGPRs. This seems to help operand folding / the register coalescer.
4008  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
4009  MachineBasicBlock *MBB = MI.getParent();
4010  const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
4011  if (RI.hasVGPRs(DstRC)) {
4012  // Update all the operands so they are VGPR register classes. These may
4013  // not be the same register class because REG_SEQUENCE supports mixing
4014  // subregister index types e.g. sub0_sub1 + sub2 + sub3
4015  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4016  MachineOperand &Op = MI.getOperand(I);
4018  continue;
4019 
4020  const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
4021  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
4022  if (VRC == OpRC)
4023  continue;
4024 
4025  legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
4026  Op.setIsKill();
4027  }
4028  }
4029 
4030  return;
4031  }
4032 
4033  // Legalize INSERT_SUBREG
4034  // src0 must have the same register class as dst
4035  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
4036  unsigned Dst = MI.getOperand(0).getReg();
4037  unsigned Src0 = MI.getOperand(1).getReg();
4038  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
4039  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
4040  if (DstRC != Src0RC) {
4041  MachineBasicBlock *MBB = MI.getParent();
4042  MachineOperand &Op = MI.getOperand(1);
4043  legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
4044  }
4045  return;
4046  }
4047 
4048  // Legalize SI_INIT_M0
4049  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
4050  MachineOperand &Src = MI.getOperand(0);
4051  if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
4052  Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
4053  return;
4054  }
4055 
4056  // Legalize MIMG and MUBUF/MTBUF for shaders.
4057  //
4058  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
4059  // scratch memory access. In both cases, the legalization never involves
4060  // conversion to the addr64 form.
4061  if (isMIMG(MI) ||
4063  (isMUBUF(MI) || isMTBUF(MI)))) {
4064  MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
4065  if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
4066  unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
4067  SRsrc->setReg(SGPR);
4068  }
4069 
4070  MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
4071  if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
4072  unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
4073  SSamp->setReg(SGPR);
4074  }
4075  return;
4076  }
4077 
4078  // Legalize MUBUF* instructions.
4079  int RsrcIdx =
4080  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
4081  if (RsrcIdx != -1) {
4082  // We have an MUBUF instruction
4083  MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
4084  unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
4085  if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
4086  RI.getRegClass(RsrcRC))) {
4087  // The operands are legal.
4088  // FIXME: We may need to legalize operands besides srsrc.
4089  return;
4090  }
4091 
4092  // Legalize a VGPR Rsrc.
4093  //
4094  // If the instruction is _ADDR64, we can avoid a waterfall by extracting
4095  // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
4096  // a zero-value SRsrc.
4097  //
4098  // If the instruction is _OFFSET (both idxen and offen disabled), and we
4099  // support ADDR64 instructions, we can convert to ADDR64 and do the same as
4100  // above.
4101  //
4102  // Otherwise we are on non-ADDR64 hardware, and/or we have
4103  // idxen/offen/bothen and we fall back to a waterfall loop.
4104 
4105  MachineBasicBlock &MBB = *MI.getParent();
4106 
4107  MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4108  if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
4109  // This is already an ADDR64 instruction so we need to add the pointer
4110  // extracted from the resource descriptor to the current value of VAddr.
4111  unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4112  unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4113  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4114 
4115  unsigned RsrcPtr, NewSRsrc;
4116  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4117 
4118  // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
4119  DebugLoc DL = MI.getDebugLoc();
4120  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
4121  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4122  .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
4123 
4124  // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
4125  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
4126  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4127  .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
4128 
4129  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4130  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4131  .addReg(NewVAddrLo)
4132  .addImm(AMDGPU::sub0)
4133  .addReg(NewVAddrHi)
4134  .addImm(AMDGPU::sub1);
4135 
4136  VAddr->setReg(NewVAddr);
4137  Rsrc->setReg(NewSRsrc);
4138  } else if (!VAddr && ST.hasAddr64()) {
4139  // This instruction is the _OFFSET variant, so we need to convert it to
4140  // ADDR64.
4141  assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4142  < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4143  "FIXME: Need to emit flat atomics here");
4144 
4145  unsigned RsrcPtr, NewSRsrc;
4146  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4147 
4148  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4149  MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4150  MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4151  MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4152  unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4153 
4154  // Atomics with return have an additional tied operand and are
4155  // missing some of the special bits.
4156  MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4157  MachineInstr *Addr64;
4158 
4159  if (!VDataIn) {
4160  // Regular buffer load / store.
4161  MachineInstrBuilder MIB =
4162  BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4163  .add(*VData)
4164  .addReg(NewVAddr)
4165  .addReg(NewSRsrc)
4166  .add(*SOffset)
4167  .add(*Offset);
4168 
4169  // Atomics do not have this operand.
4170  if (const MachineOperand *GLC =
4171  getNamedOperand(MI, AMDGPU::OpName::glc)) {
4172  MIB.addImm(GLC->getImm());
4173  }
4174 
4175  MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
4176 
4177  if (const MachineOperand *TFE =
4178  getNamedOperand(MI, AMDGPU::OpName::tfe)) {
4179  MIB.addImm(TFE->getImm());
4180  }
4181 
4182  MIB.cloneMemRefs(MI);
4183  Addr64 = MIB;
4184  } else {
4185  // Atomics with return.
4186  Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4187  .add(*VData)
4188  .add(*VDataIn)
4189  .addReg(NewVAddr)
4190  .addReg(NewSRsrc)
4191  .add(*SOffset)
4192  .add(*Offset)
4193  .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
4194  .cloneMemRefs(MI);
4195  }
4196 
4197  MI.removeFromParent();
4198 
4199  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4200  BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
4201  NewVAddr)
4202  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4203  .addImm(AMDGPU::sub0)
4204  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4205  .addImm(AMDGPU::sub1);
4206  } else {
4207  // This is another variant; legalize Rsrc with waterfall loop from VGPRs
4208  // to SGPRs.
4209  loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
4210  }
4211  }
4212 }
4213 
4214 void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
4215  MachineDominatorTree *MDT) const {
4216  SetVectorType Worklist;
4217  Worklist.insert(&TopInst);
4218 
4219  while (!Worklist.empty()) {
4220  MachineInstr &Inst = *Worklist.pop_back_val();
4221  MachineBasicBlock *MBB = Inst.getParent();
4223 
4224  unsigned Opcode = Inst.getOpcode();
4225  unsigned NewOpcode = getVALUOp(Inst);
4226 
4227  // Handle some special cases
4228  switch (Opcode) {
4229  default:
4230  break;
4231  case AMDGPU::S_ADD_U64_PSEUDO:
4232  case AMDGPU::S_SUB_U64_PSEUDO:
4233  splitScalar64BitAddSub(Worklist, Inst, MDT);
4234  Inst.eraseFromParent();
4235  continue;
4236  case AMDGPU::S_ADD_I32:
4237  case AMDGPU::S_SUB_I32:
4238  // FIXME: The u32 versions currently selected use the carry.
4239  if (moveScalarAddSub(Worklist, Inst, MDT))
4240  continue;
4241 
4242  // Default handling
4243  break;
4244  case AMDGPU::S_AND_B64:
4245  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
4246  Inst.eraseFromParent();
4247  continue;
4248 
4249  case AMDGPU::S_OR_B64:
4250  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
4251  Inst.eraseFromParent();
4252  continue;
4253 
4254  case AMDGPU::S_XOR_B64:
4255  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
4256  Inst.eraseFromParent();
4257  continue;
4258 
4259  case AMDGPU::S_NAND_B64:
4260  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
4261  Inst.eraseFromParent();
4262  continue;
4263 
4264  case AMDGPU::S_NOR_B64:
4265  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
4266  Inst.eraseFromParent();
4267  continue;
4268 
4269  case AMDGPU::S_XNOR_B64:
4270  if (ST.hasDLInsts())
4271  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
4272  else
4273  splitScalar64BitXnor(Worklist, Inst, MDT);
4274  Inst.eraseFromParent();
4275  continue;
4276 
4277  case AMDGPU::S_ANDN2_B64:
4278  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
4279  Inst.eraseFromParent();
4280  continue;
4281 
4282  case AMDGPU::S_ORN2_B64:
4283  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
4284  Inst.eraseFromParent();
4285  continue;
4286 
4287  case AMDGPU::S_NOT_B64:
4288  splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
4289  Inst.eraseFromParent();
4290  continue;
4291 
4292  case AMDGPU::S_BCNT1_I32_B64:
4293  splitScalar64BitBCNT(Worklist, Inst);
4294  Inst.eraseFromParent();
4295  continue;
4296 
4297  case AMDGPU::S_BFE_I64:
4298  splitScalar64BitBFE(Worklist, Inst);
4299  Inst.eraseFromParent();
4300  continue;
4301 
4302  case AMDGPU::S_LSHL_B32:
4303  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4304  NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
4305  swapOperands(Inst);
4306  }
4307  break;
4308  case AMDGPU::S_ASHR_I32:
4309  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4310  NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
4311  swapOperands(Inst);
4312  }
4313  break;
4314  case AMDGPU::S_LSHR_B32:
4315  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4316  NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
4317  swapOperands(Inst);
4318  }
4319  break;
4320  case AMDGPU::S_LSHL_B64:
4321  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4322  NewOpcode = AMDGPU::V_LSHLREV_B64;
4323  swapOperands(Inst);
4324  }
4325  break;
4326  case AMDGPU::S_ASHR_I64:
4327  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4328  NewOpcode = AMDGPU::V_ASHRREV_I64;
4329  swapOperands(Inst);
4330  }
4331  break;
4332  case AMDGPU::S_LSHR_B64:
4333  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4334  NewOpcode = AMDGPU::V_LSHRREV_B64;
4335  swapOperands(Inst);
4336  }
4337  break;
4338 
4339  case AMDGPU::S_ABS_I32:
4340  lowerScalarAbs(Worklist, Inst);
4341  Inst.eraseFromParent();
4342  continue;
4343 
4344  case AMDGPU::S_CBRANCH_SCC0:
4345  case AMDGPU::S_CBRANCH_SCC1:
4346  // Clear unused bits of vcc
4347  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
4348  AMDGPU::VCC)
4349  .addReg(AMDGPU::EXEC)
4350  .addReg(AMDGPU::VCC);
4351  break;
4352 
4353  case AMDGPU::S_BFE_U64:
4354  case AMDGPU::S_BFM_B64:
4355  llvm_unreachable("Moving this op to VALU not implemented");
4356 
4357  case AMDGPU::S_PACK_LL_B32_B16:
4358  case AMDGPU::S_PACK_LH_B32_B16:
4359  case AMDGPU::S_PACK_HH_B32_B16:
4360  movePackToVALU(Worklist, MRI, Inst);
4361  Inst.eraseFromParent();
4362  continue;
4363 
4364  case AMDGPU::S_XNOR_B32:
4365  lowerScalarXnor(Worklist, Inst);
4366  Inst.eraseFromParent();
4367  continue;
4368 
4369  case AMDGPU::S_NAND_B32:
4370  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
4371  Inst.eraseFromParent();
4372  continue;
4373 
4374  case AMDGPU::S_NOR_B32:
4375  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
4376  Inst.eraseFromParent();
4377  continue;
4378 
4379  case AMDGPU::S_ANDN2_B32:
4380  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
4381  Inst.eraseFromParent();
4382  continue;
4383 
4384  case AMDGPU::S_ORN2_B32:
4385  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
4386  Inst.eraseFromParent();
4387  continue;
4388  }
4389 
4390  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
4391  // We cannot move this instruction to the VALU, so we should try to
4392  // legalize its operands instead.
4393  legalizeOperands(Inst, MDT);
4394  continue;
4395  }
4396 
4397  // Use the new VALU Opcode.
4398  const MCInstrDesc &NewDesc = get(NewOpcode);
4399  Inst.setDesc(NewDesc);
4400 
4401  // Remove any references to SCC. Vector instructions can't read from it, and
4402  // we're just about to add the implicit use / defs of VCC, and we don't want
4403  // both.
4404  for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
4405  MachineOperand &Op = Inst.getOperand(i);
4406  if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
4407  // Only propagate through live-def of SCC.
4408  if (Op.isDef() && !Op.isDead())
4409  addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
4410  Inst.RemoveOperand(i);
4411  }
4412  }
4413 
4414  if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4415  // We are converting these to a BFE, so we need to add the missing
4416  // operands for the size and offset.
4417  unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4418  Inst.addOperand(MachineOperand::CreateImm(0));
4419  Inst.addOperand(MachineOperand::CreateImm(Size));
4420 
4421  } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4422  // The VALU version adds the second operand to the result, so insert an
4423  // extra 0 operand.
4424  Inst.addOperand(MachineOperand::CreateImm(0));
4425  }
4426 
4428 
4429  if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4430  const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4431  // If we need to move this to VGPRs, we need to unpack the second operand
4432  // back into the 2 separate ones for bit offset and width.
4433  assert(OffsetWidthOp.isImm() &&
4434  "Scalar BFE is only implemented for constant width and offset");
4435  uint32_t Imm = OffsetWidthOp.getImm();
4436 
4437  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4438  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4439  Inst.RemoveOperand(2); // Remove old immediate.
4440  Inst.addOperand(MachineOperand::CreateImm(Offset));
4441  Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4442  }
4443 
4444  bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4445  unsigned NewDstReg = AMDGPU::NoRegister;
4446  if (HasDst) {
4447  unsigned DstReg = Inst.getOperand(0).getReg();
4449  continue;
4450 
4451  // Update the destination register class.
4452  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4453  if (!NewDstRC)
4454  continue;
4455 
4456  if (Inst.isCopy() &&
4458  NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4459  // Instead of creating a copy where src and dst are the same register
4460  // class, we just replace all uses of dst with src. These kinds of
4461  // copies interfere with the heuristics MachineSink uses to decide
4462  // whether or not to split a critical edge, since the pass assumes
4463  // that copies will end up as machine instructions and not be
4464  // eliminated.
4465  addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4466  MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4467  MRI.clearKillFlags(Inst.getOperand(1).getReg());
4468  Inst.getOperand(0).setReg(DstReg);
4469 
4470  // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4471  // these are deleted later, but at -O0 it would leave a suspicious
4472  // looking illegal copy of an undef register.
4473  for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4474  Inst.RemoveOperand(I);
4475  Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4476  continue;
4477  }
4478 
4479  NewDstReg = MRI.createVirtualRegister(NewDstRC);
4480  MRI.replaceRegWith(DstReg, NewDstReg);
4481  }
4482 
4483  // Legalize the operands
4484  legalizeOperands(Inst, MDT);
4485 
4486  if (HasDst)
4487  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4488  }
4489 }
4490 
4491 // Add/sub require special handling to deal with carry outs.
4492 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4493  MachineDominatorTree *MDT) const {
4494  if (ST.hasAddNoCarry()) {
4495  // Assume there is no user of scc since we don't select this in that case.
4496  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4497  // is used.
4498 
4499  MachineBasicBlock &MBB = *Inst.getParent();
4501 
4502  unsigned OldDstReg = Inst.getOperand(0).getReg();
4503  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4504 
4505  unsigned Opc = Inst.getOpcode();
4506  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4507 
4508  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4509  AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4510 
4511  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4512  Inst.RemoveOperand(3);
4513 
4514  Inst.setDesc(get(NewOpc));
4515  Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
4516  Inst.addImplicitDefUseOperands(*MBB.getParent());
4517  MRI.replaceRegWith(OldDstReg, ResultReg);
4518  legalizeOperands(Inst, MDT);
4519 
4520  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4521  return true;
4522  }
4523 
4524  return false;
4525 }
4526 
4527 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4528  MachineInstr &Inst) const {
4529  MachineBasicBlock &MBB = *Inst.getParent();
4531  MachineBasicBlock::iterator MII = Inst;
4532  DebugLoc DL = Inst.getDebugLoc();
4533 
4534  MachineOperand &Dest = Inst.getOperand(0);
4535  MachineOperand &Src = Inst.getOperand(1);
4536  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4537  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4538 
4539  unsigned SubOp = ST.hasAddNoCarry() ?
4540  AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4541 
4542  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4543  .addImm(0)
4544  .addReg(Src.getReg());
4545 
4546  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4547  .addReg(Src.getReg())
4548  .addReg(TmpReg);
4549 
4550  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4551  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4552 }
4553 
4554 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4555  MachineInstr &Inst) const {
4556  MachineBasicBlock &MBB = *Inst.getParent();
4558  MachineBasicBlock::iterator MII = Inst;
4559  const DebugLoc &DL = Inst.getDebugLoc();
4560 
4561  MachineOperand &Dest = Inst.getOperand(0);
4562  MachineOperand &Src0 = Inst.getOperand(1);
4563  MachineOperand &Src1 = Inst.getOperand(2);
4564 
4565  if (ST.hasDLInsts()) {
4566  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4567  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4568  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4569 
4570  BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4571  .add(Src0)
4572  .add(Src1);
4573 
4574  MRI.replaceRegWith(Dest.getReg(), NewDest);
4575  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4576  } else {
4577  // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can invert either
4578  // source and then perform the XOR (see the sketch after this function). If
4579  // either source is a scalar register, we can leave the inversion on the
4580  // scalar unit to achieve a better distribution of scalar and vector instructions.
4581  bool Src0IsSGPR = Src0.isReg() &&
4582  RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
4583  bool Src1IsSGPR = Src1.isReg() &&
4584  RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
4585  MachineInstr *Not = nullptr;
4586  MachineInstr *Xor = nullptr;
4587  unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4588  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4589 
4590  // Build a pair of scalar instructions and add them to the work list.
4591  // The next iteration over the work list will lower these to the vector
4592  // unit as necessary.
4593  if (Src0IsSGPR) {
4594  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4595  .add(Src0);
4596  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4597  .addReg(Temp)
4598  .add(Src1);
4599  } else if (Src1IsSGPR) {
4600  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4601  .add(Src1);
4602  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4603  .add(Src0)
4604  .addReg(Temp);
4605  } else {
4606  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
4607  .add(Src0)
4608  .add(Src1);
4609  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4610  .addReg(Temp);
4611  Worklist.insert(Not);
4612  }
4613 
4614  MRI.replaceRegWith(Dest.getReg(), NewDest);
4615 
4616  Worklist.insert(Xor);
4617 
4618  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4619  }
4620 }
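// Compile-time spot check (illustrative, not required by the lowering) of the
// identity used above, shown for 8-bit masks: inverting either xor source
// produces the xnor of the two values.
static_assert((~(0xAAu ^ 0x0Fu) & 0xFFu) == (((~0xAAu & 0xFFu) ^ 0x0Fu) & 0xFFu) &&
              (~(0xAAu ^ 0x0Fu) & 0xFFu) == ((0xAAu ^ (~0x0Fu & 0xFFu)) & 0xFFu),
              "xnor can be formed by inverting either xor source");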
4621 
4622 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
4623  MachineInstr &Inst,
4624  unsigned Opcode) const {
4625  MachineBasicBlock &MBB = *Inst.getParent();
4627  MachineBasicBlock::iterator MII = Inst;
4628  const DebugLoc &DL = Inst.getDebugLoc();
4629 
4630  MachineOperand &Dest = Inst.getOperand(0);
4631  MachineOperand &Src0 = Inst.getOperand(1);
4632  MachineOperand &Src1 = Inst.getOperand(2);
4633 
4634  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4635  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4636 
4637  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
4638  .add(Src0)
4639  .add(Src1);
4640 
4641  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4642  .addReg(Interm);
4643 
4644  Worklist.insert(&Op);
4645  Worklist.insert(&Not);
4646 
4647  MRI.replaceRegWith(Dest.getReg(), NewDest);
4648  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4649 }
4650 
4651 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
4652  MachineInstr &Inst,
4653  unsigned Opcode) const {
4654  MachineBasicBlock &MBB = *Inst.getParent();
4656  MachineBasicBlock::iterator MII = Inst;
4657  const DebugLoc &DL = Inst.getDebugLoc();
4658 
4659  MachineOperand &Dest = Inst.getOperand(0);
4660  MachineOperand &Src0 = Inst.getOperand(1);
4661  MachineOperand &Src1 = Inst.getOperand(2);
4662 
4663  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4664  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4665 
4666  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
4667  .add(Src1);
4668 
4669  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
4670  .add(Src0)
4671  .addReg(Interm);
4672 
4673  Worklist.insert(&Not);
4674  Worklist.insert(&Op);
4675 
4676  MRI.replaceRegWith(Dest.getReg(), NewDest);
4677  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4678 }
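// Illustrative compile-time examples of the *N2 semantics split above:
// S_ANDN2 computes src0 & ~src1 and S_ORN2 computes src0 | ~src1, so inverting
// src1 with S_NOT first and then applying the plain AND/OR is equivalent.
static_assert((0xF0u & ~0x3Cu) == 0xC0u, "andn2(0xF0, 0x3C) == 0xC0");
static_assert(((0x0Fu | ~0x3Cu) & 0xFFu) == 0xCFu, "orn2(0x0F, 0x3C), low byte, == 0xCF");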
4679 
4680 void SIInstrInfo::splitScalar64BitUnaryOp(
4681  SetVectorType &Worklist, MachineInstr &Inst,
4682  unsigned Opcode) const {
4683  MachineBasicBlock &MBB = *Inst.getParent();
4685 
4686  MachineOperand &Dest = Inst.getOperand(0);
4687  MachineOperand &Src0 = Inst.getOperand(1);
4688  DebugLoc DL = Inst.getDebugLoc();
4689 
4690  MachineBasicBlock::iterator MII = Inst;
4691 
4692  const MCInstrDesc &InstDesc = get(Opcode);
4693  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4694  MRI.getRegClass(Src0.getReg()) :
4695  &AMDGPU::SGPR_32RegClass;
4696 
4697  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4698 
4699  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4700  AMDGPU::sub0, Src0SubRC);
4701 
4702  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4703  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4704  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4705 
4706  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4707  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4708 
4709  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4710  AMDGPU::sub1, Src0SubRC);
4711 
4712  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4713  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4714 
4715  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4716  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4717  .addReg(DestSub0)
4718  .addImm(AMDGPU::sub0)
4719  .addReg(DestSub1)
4720  .addImm(AMDGPU::sub1);
4721 
4722  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4723 
4724  Worklist.insert(&LoHalf);
4725  Worklist.insert(&HiHalf);
4726 
4727  // We don't need to legalizeOperands here because for a single operand, src0
4728  // will support any kind of input.
4729 
4730  // Move all users of this moved value.
4731  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4732 }
4733 
4734 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
4735  MachineInstr &Inst,
4736  MachineDominatorTree *MDT) const {
4737  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4738 
4739  MachineBasicBlock &MBB = *Inst.getParent();
4741 
4742  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4743  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4744  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4745 
4746  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4747  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4748 
4749  MachineOperand &Dest = Inst.getOperand(0);
4750  MachineOperand &Src0 = Inst.getOperand(1);
4751  MachineOperand &Src1 = Inst.getOperand(2);
4752  const DebugLoc &DL = Inst.getDebugLoc();
4753  MachineBasicBlock::iterator MII = Inst;
4754 
4755  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4756  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4757  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4758  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4759 
4760  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4761  AMDGPU::sub0, Src0SubRC);
4762  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4763  AMDGPU::sub0, Src1SubRC);
4764 
4765 
4766  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4767  AMDGPU::sub1, Src0SubRC);
4768  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4769  AMDGPU::sub1, Src1SubRC);
4770 
4771  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4772  MachineInstr *LoHalf =
4773  BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4774  .addReg(CarryReg, RegState::Define)
4775  .add(SrcReg0Sub0)
4776  .add(SrcReg1Sub0)
4777  .addImm(0); // clamp bit
4778 
4779  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4780  MachineInstr *HiHalf =
4781  BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4782  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4783  .add(SrcReg0Sub1)
4784  .add(SrcReg1Sub1)
4785  .addReg(CarryReg, RegState::Kill)
4786  .addImm(0); // clamp bit
4787 
4788  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4789  .addReg(DestSub0)
4790  .addImm(AMDGPU::sub0)
4791  .addReg(DestSub1)
4792  .addImm(AMDGPU::sub1);
4793 
4794  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4795 
4796  // Try to legalize the operands in case we need to swap the order to keep it
4797  // valid.
4798  legalizeOperands(*LoHalf, MDT);
4799  legalizeOperands(*HiHalf, MDT);
4800 
4801  // Move all users of this moved value.
4802  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4803 }
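// Scalar sketch (hypothetical helper) of the split performed above: the low
// halves are added with a carry-out, the carry feeds the high-half add, and
// the two results are reassembled, mirroring V_ADD_I32 / V_ADDC_U32 followed
// by REG_SEQUENCE.
static inline uint64_t add64ViaCarryExample(uint64_t A, uint64_t B) {
  uint32_t ALo = static_cast<uint32_t>(A), BLo = static_cast<uint32_t>(B);
  uint32_t Lo = ALo + BLo;                              // V_ADD_I32_e64
  uint32_t Carry = Lo < ALo ? 1 : 0;                    // carry-out (VCC)
  uint32_t Hi = static_cast<uint32_t>(A >> 32) +
                static_cast<uint32_t>(B >> 32) + Carry; // V_ADDC_U32_e64
  return (static_cast<uint64_t>(Hi) << 32) | Lo;        // REG_SEQUENCE
}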
4804 
4805 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
4806  MachineInstr &Inst, unsigned Opcode,
4807  MachineDominatorTree *MDT) const {
4808  MachineBasicBlock &MBB = *Inst.getParent();
4810 
4811  MachineOperand &Dest = Inst.getOperand(0);
4812  MachineOperand &Src0 = Inst.getOperand(1);
4813  MachineOperand &Src1 = Inst.getOperand(2);
4814  DebugLoc DL = Inst.getDebugLoc();
4815 
4816  MachineBasicBlock::iterator MII = Inst;
4817 
4818  const MCInstrDesc &InstDesc = get(Opcode);
4819  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4820  MRI.getRegClass(Src0.getReg()) :
4821  &AMDGPU::SGPR_32RegClass;
4822 
4823  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4824  const TargetRegisterClass *Src1RC = Src1.isReg() ?
4825  MRI.getRegClass(Src1.getReg()) :
4826  &AMDGPU::SGPR_32RegClass;
4827 
4828  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4829 
4830  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4831  AMDGPU::sub0, Src0SubRC);
4832  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4833  AMDGPU::sub0, Src1SubRC);
4834  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4835  AMDGPU::sub1, Src0SubRC);
4836  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4837  AMDGPU::sub1, Src1SubRC);
4838 
4839  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4840  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4841  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4842 
4843  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4844  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4845  .add(SrcReg0Sub0)
4846  .add(SrcReg1Sub0);
4847 
4848  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4849  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4850  .add(SrcReg0Sub1)
4851  .add(SrcReg1Sub1);
4852 
4853  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4854  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4855  .addReg(DestSub0)
4856  .addImm(AMDGPU::sub0)
4857  .addReg(DestSub1)
4858  .addImm(AMDGPU::sub1);
4859 
4860  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4861 
4862  Worklist.insert(&LoHalf);
4863  Worklist.insert(&HiHalf);
4864 
4865  // Move all users of this moved value.
4866  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4867 }
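// Scalar sketch (illustrative only) of the bitwise split above: a 64-bit
// S_AND/S_OR/S_XOR is the 32-bit opcode applied to the sub0 and sub1 halves
// independently, recombined with REG_SEQUENCE; AND is shown as the example.
static inline uint64_t and64ViaHalvesExample(uint64_t A, uint64_t B) {
  uint32_t Lo = static_cast<uint32_t>(A) & static_cast<uint32_t>(B);             // sub0
  uint32_t Hi = static_cast<uint32_t>(A >> 32) & static_cast<uint32_t>(B >> 32); // sub1
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}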
4868 
4869 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
4870  MachineInstr &Inst,
4871  MachineDominatorTree *MDT) const {
4872  MachineBasicBlock &MBB = *Inst.getParent();
4874 
4875  MachineOperand &Dest = Inst.getOperand(0);
4876  MachineOperand &Src0 = Inst.getOperand(1);
4877  MachineOperand &Src1 = Inst.getOperand(2);
4878  const DebugLoc &DL = Inst.getDebugLoc();
4879 
4880  MachineBasicBlock::iterator MII = Inst;
4881 
4882  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4883 
4884  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4885 
4886  MachineOperand* Op0;
4887  MachineOperand* Op1;
4888 
4889  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
4890  Op0 = &Src0;
4891  Op1 = &Src1;
4892  } else {
4893  Op0 = &Src1;
4894  Op1 = &Src0;
4895  }
4896 
4897  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
4898  .add(*Op0);
4899 
4900  unsigned NewDest = MRI.createVirtualRegister(DestRC);
4901 
4902  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
4903  .addReg(Interm)
4904  .add(*Op1);
4905 
4906  MRI.replaceRegWith(Dest.getReg(), NewDest);
4907 
4908  Worklist.insert(&Xor);
4909 }
4910 
4911 void SIInstrInfo::splitScalar64BitBCNT(
4912  SetVectorType &Worklist, MachineInstr &Inst) const {
4913  MachineBasicBlock &MBB = *Inst.getParent();
4915 
4916  MachineBasicBlock::iterator MII = Inst;
4917  const DebugLoc &DL = Inst.getDebugLoc();
4918 
4919  MachineOperand &Dest = Inst.getOperand(0);
4920  MachineOperand &Src = Inst.getOperand(1);
4921 
4922  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4923  const TargetRegisterClass *SrcRC = Src.isReg() ?
4924  MRI.getRegClass(Src.getReg()) :
4925  &AMDGPU::SGPR_32RegClass;
4926 
4927  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4928  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4929 
4930  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4931 
4932  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4933  AMDGPU::sub0, SrcSubRC);
4934  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4935  AMDGPU::sub1, SrcSubRC);
4936 
4937  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4938 
4939  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4940 
4941  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4942 
4943  // We don't need to legalize operands here. src0 for either instruction can be
4944  // an SGPR, and the second input is unused or determined here.
4945  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4946 }
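// Scalar sketch (illustrative) of the accumulation used above: V_BCNT_U32_B32
// computes popcount(src0) + src1, so counting the high half with the low-half
// result as src1 yields the full 64-bit population count.
static inline unsigned bcnt64ViaBcnt32Example(uint64_t V) {
  auto Bcnt32 = [](uint32_t Src0, unsigned Src1) {
    unsigned N = Src1;
    for (; Src0; Src0 &= Src0 - 1)
      ++N; // Clear the lowest set bit each step.
    return N;
  };
  unsigned Mid = Bcnt32(static_cast<uint32_t>(V), 0);  // low 32 bits
  return Bcnt32(static_cast<uint32_t>(V >> 32), Mid);  // accumulate the high half
}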
4947 
4948 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4949  MachineInstr &Inst) const {
4950  MachineBasicBlock &MBB = *Inst.getParent();
4952  MachineBasicBlock::iterator MII = Inst;
4953  const DebugLoc &DL = Inst.getDebugLoc();
4954 
4955  MachineOperand &Dest = Inst.getOperand(0);
4956  uint32_t Imm = Inst.getOperand(2).getImm();
4957  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4958  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4959 
4960  (void) Offset;
4961 
4962  // Only sext_inreg cases handled.
4963  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4964  Offset == 0 && "Not implemented");
4965 
4966  if (BitWidth < 32) {
4967  unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4968  unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4969  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4970 
4971  BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4972  .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4973  .addImm(0)
4974  .addImm(BitWidth);
4975 
4976  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4977  .addImm(31)
4978  .addReg(MidRegLo);
4979 
4980  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4981  .addReg(MidRegLo)
4982  .addImm(AMDGPU::sub0)
4983  .addReg(MidRegHi)
4984  .addImm(AMDGPU::sub1);
4985 
4986  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4987  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4988  return;
4989  }
4990 
4991  MachineOperand &Src = Inst.getOperand(1);
4992  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4993  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4994 
4995  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4996  .addImm(31)
4997  .addReg(Src.getReg(), 0, AMDGPU::sub0);
4998 
4999  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
5000  .addReg(Src.getReg(), 0, AMDGPU::sub0)
5001  .addImm(AMDGPU::sub0)
5002  .addReg(TmpReg)
5003  .addImm(AMDGPU::sub1);
5004 
5005  MRI.replaceRegWith(Dest.getReg(), ResultReg);
5006  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5007 }
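// Illustrative sketch (hypothetical helper) of the S_BFE immediate layout
// decoded above and in moveToVALU: the bit offset occupies bits [5:0] and the
// field width occupies bits [22:16] of the packed immediate.
static constexpr uint32_t packBFEImmExample(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | ((Width & 0x7f) << 16);
}
// e.g. offset 0, width 16 (a sext from i16) encodes as 0x00100000.
static_assert(packBFEImmExample(0, 16) == 0x00100000, "offset 0, width 16");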
5008 
5009 void SIInstrInfo::addUsersToMoveToVALUWorklist(
5010  unsigned DstReg,
5011  MachineRegisterInfo &MRI,
5012  SetVectorType &Worklist) const {
5013  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
5014  E = MRI.use_end(); I != E;) {
5015  MachineInstr &UseMI = *I->getParent();
5016 
5017  unsigned OpNo = 0;
5018 
5019  switch (UseMI.getOpcode()) {
5020  case AMDGPU::COPY:
5021  case AMDGPU::WQM:
5022  case AMDGPU::WWM:
5023  case AMDGPU::REG_SEQUENCE:
5024  case AMDGPU::PHI:
5025  case AMDGPU::INSERT_SUBREG:
5026  break;
5027  default:
5028  OpNo = I.getOperandNo();
5029  break;
5030  }
5031 
5032  if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) {
5033  Worklist.insert(&UseMI);
5034 
5035  do {
5036  ++I;
5037  } while (I != E && I->getParent() == &UseMI);
5038  } else {
5039  ++I;
5040  }
5041  }
5042 }
5043 
5044 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
5045  MachineRegisterInfo &MRI,
5046  MachineInstr &Inst) const {
5047  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5048  MachineBasicBlock *MBB = Inst.getParent();
5049  MachineOperand &Src0 = Inst.getOperand(1);
5050  MachineOperand &Src1 = Inst.getOperand(2);
5051  const DebugLoc &DL = Inst.getDebugLoc();
5052 
5053  switch (Inst.getOpcode()) {
5054  case AMDGPU::S_PACK_LL_B32_B16: {
5055  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5056  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5057 
5058  // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
5059  // 0.
5060  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5061  .addImm(0xffff);
5062 
5063  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
5064  .addReg(ImmReg, RegState::Kill)
5065  .add(Src0);
5066 
5067  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
5068  .add(Src1)
5069  .addImm(16)
5070  .addReg(TmpReg, RegState::Kill);
5071  break;
5072  }
5073  case AMDGPU::S_PACK_LH_B32_B16: {
5074  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5075  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5076  .addImm(0xffff);
5077  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
5078  .addReg(ImmReg, RegState::Kill)
5079  .add(Src0)
5080  .add(Src1);
5081  break;
5082  }
5083  case AMDGPU::S_PACK_HH_B32_B16: {
5084  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5085  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5086  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
5087  .addImm(16)
5088  .add(Src0);
5089  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5090  .addImm(0xffff0000);
5091  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
5092  .add(Src1)
5093  .addReg(ImmReg, RegState::Kill)
5094  .addReg(TmpReg, RegState::Kill);
5095  break;
5096  }
5097  default:
5098  llvm_unreachable("unhandled s_pack_* instruction");
5099  }
5100 
5101  MachineOperand &Dest = Inst.getOperand(0);
5102  MRI.replaceRegWith(Dest.getReg(), ResultReg);
5103  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5104 }
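// Scalar sketches (illustrative only; names are made up) of the three pack
// forms lowered above; each one packs 16-bit halves of the two 32-bit sources
// into a single 32-bit result.
static constexpr uint32_t packLLExample(uint32_t S0, uint32_t S1) {
  return (S0 & 0xffff) | (S1 << 16);        // low16(S0) low, low16(S1) high
}
static constexpr uint32_t packLHExample(uint32_t S0, uint32_t S1) {
  return (S0 & 0xffff) | (S1 & 0xffff0000); // low16(S0) low, high16(S1) kept high
}
static constexpr uint32_t packHHExample(uint32_t S0, uint32_t S1) {
  return (S0 >> 16) | (S1 & 0xffff0000);    // high16(S0) low, high16(S1) kept high
}
static_assert(packLLExample(0x00001234, 0x0000abcd) == 0xabcd1234, "S_PACK_LL");
static_assert(packLHExample(0x00001234, 0xabcd0000) == 0xabcd1234, "S_PACK_LH");
static_assert(packHHExample(0x12340000, 0xabcd0000) == 0xabcd1234, "S_PACK_HH");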
5105 
5106 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
5107  MachineInstr &SCCDefInst,
5108  SetVectorType &Worklist) const {
5109  // Ensure that def inst defines SCC, which is still live.
5110  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
5111  !Op.isDead() && Op.getParent() == &SCCDefInst);
5112  // This assumes that all the users of SCC are in the same block
5113  // as the SCC def.
5114  for (MachineInstr &MI : // Skip the def inst itself.
5115  make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
5116  SCCDefInst.getParent()->end())) {
5117  // Check if SCC is used first.
5118  if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
5119  Worklist.insert(&MI);
5120  // Exit if we find another SCC def.
5121  if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
5122  return;
5123  }
5124 }
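// Schematic example of the scan above (hypothetical MIR; register classes and
// most operands elided for brevity):
//
//   %0 = S_ADD_U32 %1, %2, implicit-def $scc    <- SCCDefInst
//   %3 = S_CSELECT_B32 %4, %5, implicit $scc    <- reads SCC: added to Worklist
//   S_CMP_EQ_U32 %6, %7, implicit-def $scc      <- redefines SCC: scan stops
//   %8 = S_CSELECT_B32 %9, %10, implicit $scc   <- never reached by the scan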
5125 
5126 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
5127  const MachineInstr &Inst) const {
5128  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
5129 
5130  switch (Inst.getOpcode()) {
5131  // For target instructions, getOpRegClass just returns the virtual register
5132  // class associated with the operand, so we need to find an equivalent VGPR
5133  // register class in order to move the instruction to the VALU.
5134  case AMDGPU::COPY:
5135  case AMDGPU::PHI:
5136  case AMDGPU::REG_SEQUENCE:
5137  case AMDGPU::INSERT_SUBREG:
5138  case AMDGPU::WQM:
5139  case AMDGPU::WWM:
5140  if (RI.hasVGPRs(NewDstRC))
5141  return nullptr;
5142 
5143  NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
5144  if (!NewDstRC)
5145  return nullptr;
5146  return NewDstRC;
5147  default:
5148  return NewDstRC;
5149  }
5150 }
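// For example (illustrative, using the standard AMDGPU register classes): a
// REG_SEQUENCE result currently constrained to SReg_64 is remapped via
//
//   RI.getEquivalentVGPRClass(&AMDGPU::SReg_64RegClass)  // == &AMDGPU::VReg_64RegClass
//
// while a result that already lives in a VGPR class (e.g. VGPR_32) makes the
// function return nullptr, signalling that no re-classification is needed.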
5151 
5152 // Find the one SGPR operand we are allowed to use.
5153 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
5154  int OpIndices[3]) const {
5155  const MCInstrDesc &Desc = MI.getDesc();
5156 
5157  // Find the one SGPR operand we are allowed to use.
5158  //
5159  // First we need to consider the instruction's operand requirements before
5160  // legalizing. Some operands are required to be SGPRs, such as implicit uses
5161  // of VCC, but we are still bound by the constant bus requirement to only use
5162  // one.
5163  //
5164  // If the operand's class is an SGPR, we can never move it.
5165 
5166  unsigned SGPRReg = findImplicitSGPRRead(MI);
5167  if (SGPRReg != AMDGPU::NoRegister)
5168  return SGPRReg;
5169 
5170  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
5171  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5172 
5173  for (unsigned i = 0; i < 3; ++i) {
5174  int Idx = OpIndices[i];
5175  if (Idx == -1)
5176  break;
5177 
5178  const MachineOperand &MO = MI.getOperand(Idx);
5179  if (!MO.isReg())
5180  continue;
5181 
5182  // Is this operand statically required to be an SGPR based on the operand
5183  // constraints?
5184  const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
5185  bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
5186  if (IsRequiredSGPR)
5187  return MO.getReg();
5188 
5189  // If this could be a VGPR or an SGPR, check the dynamic register class.
5190  unsigned Reg = MO.getReg();
5191  const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
5192  if (RI.isSGPRClass(RegRC))
5193  UsedSGPRs[i] = Reg;
5194  }
5195 
5196  // We don't have a required SGPR operand, so we have a bit more freedom in
5197  // selecting operands to move.
5198 
5199  // Try to select the most used SGPR. If an SGPR is equal to one of the
5200  // others, we choose that.
5201  //
5202  // e.g.
5203  // V_FMA_F32 v0, s0, s0, s0 -> No moves
5204  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
5205 
5206  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
5207  // prefer those.
5208 
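  // Worked example of the tie-break below (continuing the V_FMA_F32 examples
  // above):
  //   UsedSGPRs = { s0, s1, s0 }  ->  s0 is kept (it appears twice, so only
  //                                   s1 has to be moved to a VGPR).
  //   UsedSGPRs = { s0, s1, s2 }  ->  no repeats; AMDGPU::NoRegister is
  //                                   returned.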
5209  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
5210  if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
5211  SGPRReg = UsedSGPRs[0];
5212  }
5213 
5214  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
5215  if (UsedSGPRs[1] == UsedSGPRs[2])
5216  SGPRReg = UsedSGPRs[1];
5217  }
5218 
5219  return SGPRReg;
5220 }
5221 
5222 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
5223  unsigned OperandName) const {
5224  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
5225  if (Idx == -1)
5226  return nullptr;
5227 
5228  return &MI.getOperand(Idx);
5229 }
5230 
5231 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
5232  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
5233  if (ST.isAmdHsaOS()) {
5234  // Set ATC = 1. GFX9 doesn't have this bit.
5235  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5236  RsrcDataFormat |= (1ULL << 56);
5237 
5238  // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
5239  // Note that this also disables TC L2 and therefore decreases performance.
5240  if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
5241  RsrcDataFormat |= (2ULL << 59);
5242  }
5243 
5244  return RsrcDataFormat;
5245 }
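// Worked example (derived from the branches above): on an amdhsa target at the
// VOLCANIC_ISLANDS generation both bits are added, so the value returned is
//
//   AMDGPU::RSRC_DATA_FORMAT | (1ULL << 56)    // ATC = 1
//                            | (2ULL << 59);   // MTYPE = MTYPE_UC
//
// On GFX9, and on non-HSA targets, RSRC_DATA_FORMAT is returned unchanged.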
5246 
5247 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
5248  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
5249  AMDGPU::RSRC_TID_ENABLE |
5250  0xffffffff; // Size
5251 
5252  // GFX9 doesn't have ELEMENT_SIZE.
5253  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5254  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
5255  Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
5256  }
5257 
5258  // INDEX_STRIDE = 3 encodes an index stride of 64.
5259  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
5260 
5261  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
5262  // Clear them unless we want a huge stride.
5263  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5264  Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
5265 
5266  return Rsrc23;
5267 }
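// Worked example of the ELEMENT_SIZE encoding above (illustrative): if
// ST.getMaxPrivateElementSize() == 16, the field value is
// Log2_32(16) - 1 == 3; a 4-byte maximum private element size encodes as
// Log2_32(4) - 1 == 1.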
5268 
5269 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
5270  unsigned Opc = MI.getOpcode();
5271 
5272  return isSMRD(Opc);
5273 }
5274 
5275 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
5276  unsigned Opc = MI.getOpcode();
5277 
5278  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
5279 }
5280 
5281 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
5282  int &FrameIndex) const {
5283  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5284  if (!Addr || !Addr->isFI())
5285  return AMDGPU::NoRegister;
5286 
5287  assert(!MI.memoperands_empty() &&
5288  (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
5289 
5290  FrameIndex = Addr->getIndex();
5291  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
5292 }
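// Illustrative usage sketch (hypothetical caller; TII is assumed to be a
// const SIInstrInfo * and MI a candidate memory instruction):
//
//   int FI;
//   unsigned Reg = TII->isStackAccess(MI, FI);
//   if (Reg != AMDGPU::NoRegister) {
//     // MI accesses stack slot FI, and Reg is the vdata register that
//     // carries the loaded or stored value.
//   }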
5293 
5294 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
5295  int &FrameIndex) const {
5296  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
5297  assert(Addr && Addr->isFI());
5298  FrameIndex = Addr->getIndex();
5299  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
5300 }
5301