//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));
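
// Illustrative usage (an assumption about a typical invocation, not taken from
// this file): the long-branch lit tests shrink this option so small inputs
// already overflow the branch range, e.g.
//   llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s
// which leaves only a signed 4-bit dword offset (roughly +/- 8 instructions)
// before branch relaxation has to kick in.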

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
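
// Illustrative helper (an assumption, not part of the upstream file): the same
// index adjustment on plain integers. A MachineInstr operand list starts with
// the defs, while the matching MachineSDNode only lists the source operands,
// so a named-operand index has to be shifted down by the number of defs before
// it can index the SDNode's operand list.
static int namedIdxToSDNodeIdx(int NamedIdx, int NumDefs) {
  if (NamedIdx < 0)
    return -1; // Operand is not present on this opcode.
  return NamedIdx - NumDefs;
}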

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for the exec check.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    // No implicit operands.
    return MI.getNumOperands() == MI.getDesc().getNumOperands();
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
                                          const MachineOperand *&BaseOp,
                                          int64_t &Offset,
                                          const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt.getOpcode();

  if (isDS(LdSt)) {
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
      // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
      // report that here?
      if (!BaseOp)
        return false;

      Offset = OffsetImm->getImm();
      assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                                "operands of type register.");
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive.
    // We will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
      Offset = EltSize * Offset0;
      assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                                "operands of type register.");
      return true;
    }

    return false;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset && SOffset->isReg())
      return false;

    const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    BaseOp = AddrReg;
    Offset = OffsetImm->getImm();

    if (SOffset) // soffset can be an inline immediate.
      Offset += SOffset->getImm();

    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                              "operands of type register.");
    return true;
  }

  if (isSMRD(LdSt)) {
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    BaseOp = SBaseReg;
    Offset = OffsetImm->getImm();
    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                              "operands of type register.");
    return true;
  }

  if (isFLAT(LdSt)) {
    const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (VAddr) {
      // Can't analyze 2 offsets.
      if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
        return false;

      BaseOp = VAddr;
    } else {
      // Scratch instructions have either vaddr or saddr.
      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    }

    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                              "operands of type register.");
    return true;
  }

  return false;
}
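
// Worked example of the two-offset DS folding above (an illustrative helper,
// not part of the upstream file): a ds_read2_b32 with offset0 = 4 and
// offset1 = 5 has consecutive element-sized offsets and a 4 byte element, so
// the pair is reported as a single access at byte offset 4 * 4 = 16; the st64
// variants scale the element size by 64 first.
static int64_t ds2PairByteOffset(unsigned Offset0, unsigned EltSizeInBytes,
                                 bool IsStride64Variant) {
  if (IsStride64Variant)
    EltSizeInBytes *= 64;
  return static_cast<int64_t>(EltSizeInBytes) * Offset0;
}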

static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  const MachineOperand &BaseOp1,
                                  const MachineInstr &MI2,
                                  const MachineOperand &BaseOp2) {
  // Support only base operands with base registers.
  // Note: this could be extended to support FI operands.
  if (!BaseOp1.isReg() || !BaseOp2.isReg())
    return false;

  if (BaseOp1.isIdenticalTo(BaseOp2))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  const MachineFunction &MF = *MI1.getParent()->getParent();
  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
  Base1 = GetUnderlyingObject(Base1, DL);
  Base2 = GetUnderlyingObject(Base2, DL);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
                                      const MachineOperand &BaseOp2,
                                      unsigned NumLoads) const {
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();

  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
    return false;

  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
    const unsigned MaxGlobalLoadCluster = 6;
    if (NumLoads > MaxGlobalLoadCluster)
      return false;

    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
    if (!FirstDst)
      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
    if (!SecondDst)
      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();

  const Register Reg = FirstDst->getReg();

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                         ? MRI.getRegClass(Reg)
                                         : RI.getPhysRegClass(Reg);

  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
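
// Illustrative sketch of the byte-budget check above (assumed helper, not
// part of the upstream file): with a 16 byte threshold, four dword loads
// (4 bytes each) can still be clustered, while two 128-bit loads (16 bytes
// each) already exceed the budget.
static bool underLoadClusterThreshold(unsigned NumLoads, unsigned DstRegBits) {
  const unsigned LoadClusterThresholdBytes = 16; // mirrors the value above
  return NumLoads * (DstRegBits / 8) <= LoadClusterThresholdBytes;
}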

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split them into two batches of
// 16 stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have no more than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
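
// Minimal model of the heuristic above (illustrative, not part of the file):
// offsets 0 and 48 land within one 64 byte cacheline and may be scheduled
// together, while offsets 0 and 72 may not.
static bool loadsShareCacheline(int64_t Offset0, int64_t Offset1,
                                unsigned NumLoads) {
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}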

static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, unsigned DestReg,
                              unsigned SrcReg, bool KillSrc) {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
                                        "illegal SGPR to VGPR copy",
                                        DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc));
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, unsigned DestReg,
                              unsigned SrcReg, bool KillSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
        .addImm(-1)
        .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
      .addReg(SrcReg, getKillRegState(KillSrc))
      .addImm(0);
    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
      // First try to find defining accvgpr_write to avoid temporary registers.
      for (auto Def = MI, E = MBB.begin(); Def != E; ) {
        --Def;
        if (!Def->definesRegister(SrcReg, &RI))
          continue;
        if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
          break;

        MachineOperand &DefOp = Def->getOperand(1);
        assert(DefOp.isReg() || DefOp.isImm());

        if (DefOp.isReg()) {
          // Check that the register source operand is not clobbered before MI.
          // Immediate operands are always safe to propagate.
          bool SafeToPropagate = true;
          for (auto I = Def; I != MI && SafeToPropagate; ++I)
            if (I->modifiesRegister(DefOp.getReg(), &RI))
              SafeToPropagate = false;

          if (!SafeToPropagate)
            break;

          DefOp.setIsKill(false);
        }

        BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
          .add(DefOp);
        return;
      }

      RegScavenger RS;
      RS.enterBasicBlock(MBB);
      RS.forward(MI);

      // Ideally we want to have three registers for a long reg_sequence copy
      // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
      unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                                 *MBB.getParent());

      // Registers in the sequence are allocated contiguously so we can just
      // use register number to pick one of three round-robin temps.
      unsigned RegNo = DestReg % 3;
      unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
      if (!Tmp)
        report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
      RS.setRegUsed(Tmp);
      // Only loop through if there are any free registers left, otherwise
      // scavenger may report a fatal error without emergency spill slot
      // or spill with the slot.
      while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
        unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
        if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
          break;
        Tmp = Tmp2;
        RS.setRegUsed(Tmp);
      }
      copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
        .addReg(Tmp, RegState::Kill);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RC)) {
    // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
    if (!(RI.getRegSizeInBits(*RC) % 64)) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }

    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
  } else if (RI.hasAGPRs(RC)) {
    Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
      AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32;
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    if (Opcode == TargetOpcode::COPY) {
      copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
                  RI.getSubReg(SrcReg, SubIdx), KillSrc);
      continue;
    }

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

    bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
    Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
  }
}
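
// Illustrative note on the sub-register loop above (an assumption, not code
// from the upstream file): the copy direction is chosen so an overlapping
// source range is never clobbered before it is read. When the destination
// starts at or below the source's hardware register index the copy is safe
// low-to-high; otherwise the loop walks the sub-registers in reverse.
static bool copySubRegsLowToHigh(unsigned DestHWRegIndex,
                                 unsigned SrcHWRegIndex) {
  return DestHWRegIndex <= SrcHWRegIndex; // mirrors the Forward flag above
}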

int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, unsigned DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  if (RegClass == &AMDGPU::SReg_32RegClass ||
      RegClass == &AMDGPU::SGPR_32RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::SReg_64RegClass ||
      RegClass == &AMDGPU::SGPR_64RegClass ||
      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::VGPR_32RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(Value);
    return;
  }
  if (RegClass == &AMDGPU::VReg_64RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
      .addImm(Value);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RegClass)) {
    if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    int64_t IdxValue = Idx == 0 ? Value : 0;

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, Idx));
    Builder.addImm(IdxValue);
  }
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  return &AMDGPU::VGPR_32RegClass;
}

void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, unsigned DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     unsigned TrueReg,
                                     unsigned FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *BoolXExecRC =
    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(-1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(-1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(TrueReg)
        .addImm(0)
        .addReg(FalseReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(-1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(-1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}

unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               unsigned SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               unsigned SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (RI.hasAGPRs(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 32) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_S96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_S160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_V160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_A64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_A128_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_A512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    // Add the scratch resource registers as implicit uses because we may end up
    // needing them, and need to ensure that the reserved registers are
    // correctly handled.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    if (ST.hasScalarStores()) {
      // m0 is used for offset to scalar stores if used to spill.
      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
    }

    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
                                    : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
  if (RI.hasAGPRs(RC)) {
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
     .addImm(0)                               // offset
     .addMemOperand(MMO);
}

static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_A64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_A128_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_A512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);

    if (ST.hasScalarStores()) {
      // m0 is used for offset to scalar stores if used to spill.
      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
    }

    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                    : getVGPRSpillRestoreOpcode(SpillSize);
  auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
  if (RI.hasAGPRs(RC)) {
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addFrameIndex(FrameIndex)           // vaddr
     .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0)                           // offset
     .addMemOperand(MMO);
}

/// \param Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    const DebugLoc &DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      Register TIDIGXReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
      Register TIDIGYReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
      Register TIDIGZReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
      Register InputPtrReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
          .addReg(InputPtrReg)
          .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
          .addReg(InputPtrReg)
          .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
          .addReg(STmp1)
          .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
          .addReg(STmp1)
          .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
          .addReg(STmp0)
          .addReg(TIDIGYReg)
          .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      getAddNoCarry(Entry, Insert, DL, TIDReg)
          .addReg(TIDReg)
          .addReg(TIDIGZReg)
          .addImm(0); // clamp bit
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
          .addImm(-1)
          .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
          .addImm(-1)
          .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
        .addImm(2)
        .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  getAddNoCarry(MBB, MI, DL, TmpReg)
      .addImm(LDSOffset)
      .addReg(TIDReg)
      .addImm(0); // clamp bit

  return TmpReg;
}
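
// Worked example for the LDS address computation above (an illustrative
// helper, not part of the upstream file): every byte of the spilled frame
// object is replicated once per work-item, so the frame offset is scaled by
// the work-group size before being added to the statically allocated LDS.
// With 1024 bytes of module LDS, FrameOffset = 16 and a 256 work-item group,
// the slot region starts at 1024 + 16 * 256 = 5120; each lane then adds
// TID * 4 (the left shift by 2 above).
static unsigned ldsSpillBase(unsigned LDSSize, unsigned FrameOffset,
                             unsigned WorkGroupSize) {
  return LDSSize + FrameOffset * WorkGroupSize;
}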

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}
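
// Illustrative sketch of the S_NOP batching above (assumed helper, not part
// of the upstream file): the S_NOP immediate encodes "wait N + 1 states" with
// N in [0, 7], so Count = 10 becomes S_NOP 7 (8 states) followed by S_NOP 1
// (2 states).
static void waitStatesToNopImms(int Count, SmallVectorImpl<int> &NopImms) {
  while (Count > 0) {
    NopImms.push_back(Count >= 8 ? 7 : Count - 1);
    Count -= 8;
  }
}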

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;

  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_OR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B64));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::S_ANDN2_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOVRELD_B32_V1:
  case AMDGPU::V_MOVRELD_B32_V2:
  case AMDGPU::V_MOVRELD_B32_V4:
  case AMDGPU::V_MOVRELD_B32_V8:
  case AMDGPU::V_MOVRELD_B32_V16: {
    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstr *MovRel =
        BuildMI(MBB, MI, DL, MovRelDesc)
            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
            .add(MI.getOperand(2))
            .addReg(VecReg, RegState::ImplicitDefine)
            .addReg(VecReg,
                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    const int ImpDefIdx =
        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is entered.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::EXIT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is exited.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  case TargetOpcode::BUNDLE: {
    if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
      return false;

    // If it is a load it must be a memory clause
    for (MachineBasicBlock::instr_iterator I = MI.getIterator();
         I->isBundledWithSucc(); ++I) {
      I->unbundleFromSucc();
      for (MachineOperand &MO : I->operands())
        if (MO.isReg())
          MO.setIsInternalRead(false);
    }

    MI.eraseFromParent();
    break;
  }
  }
  return true;
}
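
// Illustrative sketch (not part of the upstream file) of the 64-bit immediate
// split performed for V_MOV_B64_PSEUDO above: sub0 receives the low 32 bits
// and sub1 the high 32 bits, e.g. 0x1122334455667788 becomes the pair
// (0x55667788, 0x11223344).
static void splitMovImm64(uint64_t Imm, uint32_t &Lo32, uint32_t &Hi32) {
  Lo32 = static_cast<uint32_t>(Imm);       // written to the sub0 half
  Hi32 = static_cast<uint32_t>(Imm >> 32); // written to the sub1 half
}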

bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  Register Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}

bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}
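
// Worked example of the range check above (illustrative only): the hardware
// adds signext(SIMM16) * 4 to the PC of the *next* instruction, so a byte
// offset fits iff (BrOffset / 4) - 1 is representable in the configured
// number of branch-offset bits; with the default 16 bits that is roughly
// -128 KiB to +128 KiB around the branch.
static bool branchByteOffsetFits(int64_t ByteOffset, unsigned OffsetBits) {
  int64_t Dwords = ByteOffset / 4 - 1;
  int64_t Limit = int64_t(1) << (OffsetBits - 1);
  return Dwords >= -Limit && Dwords < Limit;
}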

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}
1703 
1705  MachineBasicBlock &DestBB,
1706  const DebugLoc &DL,
1707  int64_t BrOffset,
1708  RegScavenger *RS) const {
1709  assert(RS && "RegScavenger required for long branching");
1710  assert(MBB.empty() &&
1711  "new block should be inserted for expanding unconditional branch");
1712  assert(MBB.pred_size() == 1);
1713 
1714  MachineFunction *MF = MBB.getParent();
1715  MachineRegisterInfo &MRI = MF->getRegInfo();
1716 
1717  // FIXME: Virtual register workaround for RegScavenger not working with empty
1718  // blocks.
1719  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1720 
1721  auto I = MBB.end();
1722 
1723  // We need to compute the offset relative to the instruction immediately after
1724  // s_getpc_b64. Insert pc arithmetic code before last terminator.
1725  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1726 
1727  // TODO: Handle > 32-bit block address.
1728  if (BrOffset >= 0) {
1729  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1730  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1731  .addReg(PCReg, 0, AMDGPU::sub0)
1732  .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
1733  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1734  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1735  .addReg(PCReg, 0, AMDGPU::sub1)
1736  .addImm(0);
1737  } else {
1738  // Backwards branch.
1739  BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1740  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1741  .addReg(PCReg, 0, AMDGPU::sub0)
1742  .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
1743  BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1744  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1745  .addReg(PCReg, 0, AMDGPU::sub1)
1746  .addImm(0);
1747  }
1748 
1749  // Insert the indirect branch after the other terminator.
1750  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1751  .addReg(PCReg);
1752 
1753  // FIXME: If spilling is necessary, this will fail because this scavenger has
1754  // no emergency stack slots. It is non-trivial to spill in this situation,
1755  // because the restore code needs to be specially placed after the
1756  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1757  // block.
1758  //
1759  // If a spill is needed for the pc register pair, we need to insert a spill
1760  // restore block right before the destination block, and insert a short branch
1761  // into the old destination block's fallthrough predecessor.
1762  // e.g.:
1763  //
1764  // s_cbranch_scc0 skip_long_branch:
1765  //
1766  // long_branch_bb:
1767  // spill s[8:9]
1768  // s_getpc_b64 s[8:9]
1769  // s_add_u32 s8, s8, restore_bb
1770  // s_addc_u32 s9, s9, 0
1771  // s_setpc_b64 s[8:9]
1772  //
1773  // skip_long_branch:
1774  // foo;
1775  //
1776  // .....
1777  //
1778  // dest_bb_fallthrough_predecessor:
1779  // bar;
1780  // s_branch dest_bb
1781  //
1782  // restore_bb:
1783  // restore s[8:9]
1784  // fallthrough dest_bb
1785  //
1786  // dest_bb:
1787  // buzz;
1788 
1789  RS->enterBasicBlockEnd(MBB);
1790  unsigned Scav = RS->scavengeRegisterBackwards(
1791  AMDGPU::SReg_64RegClass,
1792  MachineBasicBlock::iterator(GetPC), false, 0);
1793  MRI.replaceRegWith(PCReg, Scav);
1794  MRI.clearVirtRegs();
1795  RS->setRegUsed(Scav);
1796 
1797  return 4 + 8 + 4 + 4;
1798 }
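// Illustrative sketch, not part of the original file: the size returned above,
// assuming 4-byte encodings for each scalar instruction in the expansion,
// presumably breaks down as s_getpc_b64 (4) + the s_add_u32/s_addc_u32 pair (8)
// + s_setpc_b64 (4), plus 4 bytes of slack for the offset bookkeeping.
static constexpr unsigned ExampleLongBranchExpansionBytes = 4 + 8 + 4 + 4;
static_assert(ExampleLongBranchExpansionBytes == 20,
              "sum of the components listed above");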
1799 
1800 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1801  switch (Cond) {
1802  case SIInstrInfo::SCC_TRUE:
1803  return AMDGPU::S_CBRANCH_SCC1;
1804  case SIInstrInfo::SCC_FALSE:
1805  return AMDGPU::S_CBRANCH_SCC0;
1806  case SIInstrInfo::VCCNZ:
1807  return AMDGPU::S_CBRANCH_VCCNZ;
1808  case SIInstrInfo::VCCZ:
1809  return AMDGPU::S_CBRANCH_VCCZ;
1810  case SIInstrInfo::EXECNZ:
1811  return AMDGPU::S_CBRANCH_EXECNZ;
1812  case SIInstrInfo::EXECZ:
1813  return AMDGPU::S_CBRANCH_EXECZ;
1814  default:
1815  llvm_unreachable("invalid branch predicate");
1816  }
1817 }
1818 
1819 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1820  switch (Opcode) {
1821  case AMDGPU::S_CBRANCH_SCC0:
1822  return SCC_FALSE;
1823  case AMDGPU::S_CBRANCH_SCC1:
1824  return SCC_TRUE;
1825  case AMDGPU::S_CBRANCH_VCCNZ:
1826  return VCCNZ;
1827  case AMDGPU::S_CBRANCH_VCCZ:
1828  return VCCZ;
1829  case AMDGPU::S_CBRANCH_EXECNZ:
1830  return EXECNZ;
1831  case AMDGPU::S_CBRANCH_EXECZ:
1832  return EXECZ;
1833  default:
1834  return INVALID_BR;
1835  }
1836 }
1837 
1838 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1839  MachineBasicBlock::iterator I,
1840  MachineBasicBlock *&TBB,
1841  MachineBasicBlock *&FBB,
1842  SmallVectorImpl<MachineOperand> &Cond,
1843  bool AllowModify) const {
1844  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1845  // Unconditional Branch
1846  TBB = I->getOperand(0).getMBB();
1847  return false;
1848  }
1849 
1850  MachineBasicBlock *CondBB = nullptr;
1851 
1852  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1853  CondBB = I->getOperand(1).getMBB();
1854  Cond.push_back(I->getOperand(0));
1855  } else {
1856  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1857  if (Pred == INVALID_BR)
1858  return true;
1859 
1860  CondBB = I->getOperand(0).getMBB();
1861  Cond.push_back(MachineOperand::CreateImm(Pred));
1862  Cond.push_back(I->getOperand(1)); // Save the branch register.
1863  }
1864  ++I;
1865 
1866  if (I == MBB.end()) {
1867  // Conditional branch followed by fall-through.
1868  TBB = CondBB;
1869  return false;
1870  }
1871 
1872  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1873  TBB = CondBB;
1874  FBB = I->getOperand(0).getMBB();
1875  return false;
1876  }
1877 
1878  return true;
1879 }
1880 
1881 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1882  MachineBasicBlock *&FBB,
1883  SmallVectorImpl<MachineOperand> &Cond,
1884  bool AllowModify) const {
1885  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1886  auto E = MBB.end();
1887  if (I == E)
1888  return false;
1889 
1890  // Skip over the instructions that are artificially terminators for special
1891  // exec management.
1892  while (I != E && !I->isBranch() && !I->isReturn() &&
1893  I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
1894  switch (I->getOpcode()) {
1895  case AMDGPU::SI_MASK_BRANCH:
1896  case AMDGPU::S_MOV_B64_term:
1897  case AMDGPU::S_XOR_B64_term:
1898  case AMDGPU::S_OR_B64_term:
1899  case AMDGPU::S_ANDN2_B64_term:
1900  case AMDGPU::S_MOV_B32_term:
1901  case AMDGPU::S_XOR_B32_term:
1902  case AMDGPU::S_OR_B32_term:
1903  case AMDGPU::S_ANDN2_B32_term:
1904  break;
1905  case AMDGPU::SI_IF:
1906  case AMDGPU::SI_ELSE:
1907  case AMDGPU::SI_KILL_I1_TERMINATOR:
1908  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1909  // FIXME: It's messy that these need to be considered here at all.
1910  return true;
1911  default:
1912  llvm_unreachable("unexpected non-branch terminator inst");
1913  }
1914 
1915  ++I;
1916  }
1917 
1918  if (I == E)
1919  return false;
1920 
1921  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1922  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1923 
1924  ++I;
1925 
1926  // TODO: Should be able to treat as fallthrough?
1927  if (I == MBB.end())
1928  return true;
1929 
1930  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1931  return true;
1932 
1933  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1934 
1935  // Specifically handle the case where the conditional branch is to the same
1936  // destination as the mask branch. e.g.
1937  //
1938  // si_mask_branch BB8
1939  // s_cbranch_execz BB8
1940  // s_cbranch BB9
1941  //
1942  // This is required to understand divergent loops which may need the branches
1943  // to be relaxed.
1944  if (TBB != MaskBrDest || Cond.empty())
1945  return true;
1946 
1947  auto Pred = Cond[0].getImm();
1948  return (Pred != EXECZ && Pred != EXECNZ);
1949 }
1950 
1951 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1952  int *BytesRemoved) const {
1953  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1954 
1955  unsigned Count = 0;
1956  unsigned RemovedSize = 0;
1957  while (I != MBB.end()) {
1958  MachineBasicBlock::iterator Next = std::next(I);
1959  if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1960  I = Next;
1961  continue;
1962  }
1963 
1964  RemovedSize += getInstSizeInBytes(*I);
1965  I->eraseFromParent();
1966  ++Count;
1967  I = Next;
1968  }
1969 
1970  if (BytesRemoved)
1971  *BytesRemoved = RemovedSize;
1972 
1973  return Count;
1974 }
1975 
1976 // Copy the flags onto the implicit condition register operand.
1977 static void preserveCondRegFlags(MachineOperand &CondReg,
1978  const MachineOperand &OrigCond) {
1979  CondReg.setIsUndef(OrigCond.isUndef());
1980  CondReg.setIsKill(OrigCond.isKill());
1981 }
1982 
1983 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1984  MachineBasicBlock *TBB,
1985  MachineBasicBlock *FBB,
1986  ArrayRef<MachineOperand> Cond,
1987  const DebugLoc &DL,
1988  int *BytesAdded) const {
1989  if (!FBB && Cond.empty()) {
1990  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1991  .addMBB(TBB);
1992  if (BytesAdded)
1993  *BytesAdded = 4;
1994  return 1;
1995  }
1996 
1997  if (Cond.size() == 1 && Cond[0].isReg()) {
1998  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1999  .add(Cond[0])
2000  .addMBB(TBB);
2001  return 1;
2002  }
2003 
2004  assert(TBB && Cond[0].isImm());
2005 
2006  unsigned Opcode
2007  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
2008 
2009  if (!FBB) {
2010  Cond[1].isUndef();
2011  MachineInstr *CondBr =
2012  BuildMI(&MBB, DL, get(Opcode))
2013  .addMBB(TBB);
2014 
2015  // Copy the flags onto the implicit condition register operand.
2016  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2017 
2018  if (BytesAdded)
2019  *BytesAdded = 4;
2020  return 1;
2021  }
2022 
2023  assert(TBB && FBB);
2024 
2025  MachineInstr *CondBr =
2026  BuildMI(&MBB, DL, get(Opcode))
2027  .addMBB(TBB);
2028  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2029  .addMBB(FBB);
2030 
2031  MachineOperand &CondReg = CondBr->getOperand(1);
2032  CondReg.setIsUndef(Cond[1].isUndef());
2033  CondReg.setIsKill(Cond[1].isKill());
2034 
2035  if (BytesAdded)
2036  *BytesAdded = 8;
2037 
2038  return 2;
2039 }
2040 
2041 bool SIInstrInfo::reverseBranchCondition(
2042  SmallVectorImpl<MachineOperand> &Cond) const {
2043  if (Cond.size() != 2) {
2044  return true;
2045  }
2046 
2047  if (Cond[0].isImm()) {
2048  Cond[0].setImm(-Cond[0].getImm());
2049  return false;
2050  }
2051 
2052  return true;
2053 }
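// Illustrative sketch, not part of the original file: reversing the condition
// above relies on the BranchPredicate encoding pairing each predicate with its
// inverse at the negated enumerator value. A standalone mirror of that
// convention, using hypothetical names:
namespace reverse_predicate_example {
enum Pred : int { SccFalse = -1, Invalid = 0, SccTrue = 1, VccZ = -2, VccNZ = 2 };
constexpr Pred reverse(Pred P) { return static_cast<Pred>(-P); }
static_assert(reverse(SccTrue) == SccFalse, "negating the imm flips the predicate");
static_assert(reverse(VccZ) == VccNZ, "and holds for every inverse pair");
} // namespace reverse_predicate_example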
2054 
2055 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
2056  ArrayRef<MachineOperand> Cond,
2057  unsigned TrueReg, unsigned FalseReg,
2058  int &CondCycles,
2059  int &TrueCycles, int &FalseCycles) const {
2060  switch (Cond[0].getImm()) {
2061  case VCCNZ:
2062  case VCCZ: {
2063  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2064  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2065  assert(MRI.getRegClass(FalseReg) == RC);
2066 
2067  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2068  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2069 
2070  // Limit to equal cost for branch vs. N v_cndmask_b32s.
2071  return RI.hasVGPRs(RC) && NumInsts <= 6;
2072  }
2073  case SCC_TRUE:
2074  case SCC_FALSE: {
2075  // FIXME: We could insert for VGPRs if we could replace the original compare
2076  // with a vector one.
2077  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2078  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2079  assert(MRI.getRegClass(FalseReg) == RC);
2080 
2081  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2082 
2083  // Widths that are a multiple of 64 bits (8 bytes) can use s_cselect_b64.
2084  if (NumInsts % 2 == 0)
2085  NumInsts /= 2;
2086 
2087  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2088  return RI.isSGPRClass(RC);
2089  }
2090  default:
2091  return false;
2092  }
2093 }
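// Illustrative sketch, not part of the original file: the instruction-count
// estimate used above, written against a plain bit width. Each 32 bits of a
// VALU select costs one v_cndmask_b32; an SGPR select can cover 64 bits at a
// time with s_cselect_b64 when the width is a multiple of 64.
static int exampleSelectCost(unsigned BitWidth, bool IsSGPRSelect) {
  int NumInsts = BitWidth / 32;
  if (IsSGPRSelect && NumInsts % 2 == 0)
    NumInsts /= 2;                       // pairs of dwords use s_cselect_b64
  return NumInsts;
}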
2094 
2095 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
2096  MachineBasicBlock::iterator I, const DebugLoc &DL,
2097  unsigned DstReg, ArrayRef<MachineOperand> Cond,
2098  unsigned TrueReg, unsigned FalseReg) const {
2099  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
2100  if (Pred == VCCZ || Pred == SCC_FALSE) {
2101  Pred = static_cast<BranchPredicate>(-Pred);
2102  std::swap(TrueReg, FalseReg);
2103  }
2104 
2105  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2106  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
2107  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
2108 
2109  if (DstSize == 32) {
2110  unsigned SelOp = Pred == SCC_TRUE ?
2111  AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
2112 
2113  // Instruction's operands are backwards from what is expected.
2114  MachineInstr *Select =
2115  BuildMI(MBB, I, DL, get(SelOp), DstReg)
2116  .addReg(FalseReg)
2117  .addReg(TrueReg);
2118 
2119  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2120  return;
2121  }
2122 
2123  if (DstSize == 64 && Pred == SCC_TRUE) {
2124  MachineInstr *Select =
2125  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
2126  .addReg(FalseReg)
2127  .addReg(TrueReg);
2128 
2129  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2130  return;
2131  }
2132 
2133  static const int16_t Sub0_15[] = {
2134  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
2135  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
2136  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
2137  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
2138  };
2139 
2140  static const int16_t Sub0_15_64[] = {
2141  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
2142  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
2143  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
2144  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
2145  };
2146 
2147  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
2148  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
2149  const int16_t *SubIndices = Sub0_15;
2150  int NElts = DstSize / 32;
2151 
2152  // 64-bit select is only available for SALU.
2153  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
2154  if (Pred == SCC_TRUE) {
2155  if (NElts % 2) {
2156  SelOp = AMDGPU::S_CSELECT_B32;
2157  EltRC = &AMDGPU::SGPR_32RegClass;
2158  } else {
2159  SelOp = AMDGPU::S_CSELECT_B64;
2160  EltRC = &AMDGPU::SGPR_64RegClass;
2161  SubIndices = Sub0_15_64;
2162  NElts /= 2;
2163  }
2164  }
2165 
2166  MachineInstrBuilder MIB = BuildMI(
2167  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
2168 
2169  I = MIB->getIterator();
2170 
2172  for (int Idx = 0; Idx != NElts; ++Idx) {
2173  Register DstElt = MRI.createVirtualRegister(EltRC);
2174  Regs.push_back(DstElt);
2175 
2176  unsigned SubIdx = SubIndices[Idx];
2177 
2178  MachineInstr *Select =
2179  BuildMI(MBB, I, DL, get(SelOp), DstElt)
2180  .addReg(FalseReg, 0, SubIdx)
2181  .addReg(TrueReg, 0, SubIdx);
2182  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2183  fixImplicitOperands(*Select);
2184 
2185  MIB.addReg(DstElt)
2186  .addImm(SubIdx);
2187  }
2188 }
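// Illustrative note, not part of the original file: for a hypothetical 128-bit
// VGPR select on VCC, the loop above emits one v_cndmask_b32 per 32-bit
// element (false operand first, matching the backwards operand order noted
// earlier) and recombines the pieces with REG_SEQUENCE, roughly:
//   v_cndmask_b32 e0, false.sub0, true.sub0   ; reads vcc
//   v_cndmask_b32 e1, false.sub1, true.sub1
//   v_cndmask_b32 e2, false.sub2, true.sub2
//   v_cndmask_b32 e3, false.sub3, true.sub3
//   dst = REG_SEQUENCE e0, sub0, e1, sub1, e2, sub2, e3, sub3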
2189 
2191  switch (MI.getOpcode()) {
2192  case AMDGPU::V_MOV_B32_e32:
2193  case AMDGPU::V_MOV_B32_e64:
2194  case AMDGPU::V_MOV_B64_PSEUDO: {
2195  // If there are additional implicit register operands, this may be used for
2196  // register indexing so the source register operand isn't simply copied.
2197  unsigned NumOps = MI.getDesc().getNumOperands() +
2198  MI.getDesc().getNumImplicitUses();
2199 
2200  return MI.getNumOperands() == NumOps;
2201  }
2202  case AMDGPU::S_MOV_B32:
2203  case AMDGPU::S_MOV_B64:
2204  case AMDGPU::COPY:
2205  case AMDGPU::V_ACCVGPR_WRITE_B32:
2206  case AMDGPU::V_ACCVGPR_READ_B32:
2207  return true;
2208  default:
2209  return false;
2210  }
2211 }
2212 
2213 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
2214  unsigned Kind) const {
2215  switch(Kind) {
2226  }
2227  return AMDGPUAS::FLAT_ADDRESS;
2228 }
2229 
2230 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
2231  unsigned Opc = MI.getOpcode();
2232  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2233  AMDGPU::OpName::src0_modifiers);
2234  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2235  AMDGPU::OpName::src1_modifiers);
2236  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2237  AMDGPU::OpName::src2_modifiers);
2238 
2239  MI.RemoveOperand(Src2ModIdx);
2240  MI.RemoveOperand(Src1ModIdx);
2241  MI.RemoveOperand(Src0ModIdx);
2242 }
2243 
2244 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
2245  unsigned Reg, MachineRegisterInfo *MRI) const {
2246  if (!MRI->hasOneNonDBGUse(Reg))
2247  return false;
2248 
2249  switch (DefMI.getOpcode()) {
2250  default:
2251  return false;
2252  case AMDGPU::S_MOV_B64:
2253  // TODO: We could fold 64-bit immediates, but this gets complicated
2254  // when there are sub-registers.
2255  return false;
2256 
2257  case AMDGPU::V_MOV_B32_e32:
2258  case AMDGPU::S_MOV_B32:
2259  case AMDGPU::V_ACCVGPR_WRITE_B32:
2260  break;
2261  }
2262 
2263  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2264  assert(ImmOp);
2265  // FIXME: We could handle FrameIndex values here.
2266  if (!ImmOp->isImm())
2267  return false;
2268 
2269  unsigned Opc = UseMI.getOpcode();
2270  if (Opc == AMDGPU::COPY) {
2271  bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
2272  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2273  if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) {
2274  if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32))
2275  return false;
2276  NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
2277  }
2278  UseMI.setDesc(get(NewOpc));
2279  UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
2280  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2281  return true;
2282  }
2283 
2284  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2285  Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
2286  Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2287  Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
2288  // Don't fold if we are using source or output modifiers. The new VOP2
2289  // instructions don't have them.
2290  if (hasAnyModifiersSet(UseMI))
2291  return false;
2292 
2293  // If this is a free constant, there's no reason to do this.
2294  // TODO: We could fold this here instead of letting SIFoldOperands do it
2295  // later.
2296  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2297 
2298  // Any src operand can be used for the legality check.
2299  if (isInlineConstant(UseMI, *Src0, *ImmOp))
2300  return false;
2301 
2302  bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2303  Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
2304  bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2305  Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
2306  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2307  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2308 
2309  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2310  // We should only expect these to be on src0 due to canonicalizations.
2311  if (Src0->isReg() && Src0->getReg() == Reg) {
2312  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2313  return false;
2314 
2315  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2316  return false;
2317 
2318  unsigned NewOpc =
2319  IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
2320  : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
2321  if (pseudoToMCOpcode(NewOpc) == -1)
2322  return false;
2323 
2324  // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2325 
2326  const int64_t Imm = ImmOp->getImm();
2327 
2328  // FIXME: This would be a lot easier if we could return a new instruction
2329  // instead of having to modify in place.
2330 
2331  // Remove these first since they are at the end.
2332  UseMI.RemoveOperand(
2333  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2334  UseMI.RemoveOperand(
2335  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2336 
2337  Register Src1Reg = Src1->getReg();
2338  unsigned Src1SubReg = Src1->getSubReg();
2339  Src0->setReg(Src1Reg);
2340  Src0->setSubReg(Src1SubReg);
2341  Src0->setIsKill(Src1->isKill());
2342 
2343  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2344  Opc == AMDGPU::V_MAC_F16_e64 ||
2345  Opc == AMDGPU::V_FMAC_F32_e64 ||
2346  Opc == AMDGPU::V_FMAC_F16_e64)
2347  UseMI.untieRegOperand(
2348  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2349 
2350  Src1->ChangeToImmediate(Imm);
2351 
2352  removeModOperands(UseMI);
2353  UseMI.setDesc(get(NewOpc));
2354 
2355  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2356  if (DeleteDef)
2357  DefMI.eraseFromParent();
2358 
2359  return true;
2360  }
2361 
2362  // Added part is the constant: Use v_madak_{f16, f32}.
2363  if (Src2->isReg() && Src2->getReg() == Reg) {
2364  // Not allowed to use constant bus for another operand.
2365  // We can however allow an inline immediate as src0.
2366  bool Src0Inlined = false;
2367  if (Src0->isReg()) {
2368  // Try to inline the constant if possible.
2369  // If the def is a move-immediate and this is its only use,
2370  // inlining it here saves a VGPR.
2371  MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2372  if (Def && Def->isMoveImmediate() &&
2373  isInlineConstant(Def->getOperand(1)) &&
2374  MRI->hasOneUse(Src0->getReg())) {
2375  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2376  Src0Inlined = true;
2377  } else if ((Register::isPhysicalRegister(Src0->getReg()) &&
2378  (ST.getConstantBusLimit(Opc) <= 1 &&
2379  RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
2380  (Register::isVirtualRegister(Src0->getReg()) &&
2381  (ST.getConstantBusLimit(Opc) <= 1 &&
2382  RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
2383  return false;
2384  // VGPR is okay as Src0 - fallthrough
2385  }
2386 
2387  if (Src1->isReg() && !Src0Inlined ) {
2388  // We have one slot for inlinable constant so far - try to fill it
2389  MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2390  if (Def && Def->isMoveImmediate() &&
2391  isInlineConstant(Def->getOperand(1)) &&
2392  MRI->hasOneUse(Src1->getReg()) &&
2393  commuteInstruction(UseMI)) {
2394  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2395  } else if ((Register::isPhysicalRegister(Src1->getReg()) &&
2396  RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2397  (Register::isVirtualRegister(Src1->getReg()) &&
2398  RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2399  return false;
2400  // VGPR is okay as Src1 - fallthrough
2401  }
2402 
2403  unsigned NewOpc =
2404  IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
2405  : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
2406  if (pseudoToMCOpcode(NewOpc) == -1)
2407  return false;
2408 
2409  const int64_t Imm = ImmOp->getImm();
2410 
2411  // FIXME: This would be a lot easier if we could return a new instruction
2412  // instead of having to modify in place.
2413 
2414  // Remove these first since they are at the end.
2415  UseMI.RemoveOperand(
2416  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2417  UseMI.RemoveOperand(
2418  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2419 
2420  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2421  Opc == AMDGPU::V_MAC_F16_e64 ||
2422  Opc == AMDGPU::V_FMAC_F32_e64 ||
2423  Opc == AMDGPU::V_FMAC_F16_e64)
2424  UseMI.untieRegOperand(
2425  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2426 
2427  // ChangingToImmediate adds Src2 back to the instruction.
2428  Src2->ChangeToImmediate(Imm);
2429 
2430  // These come before src2.
2431  removeModOperands(UseMI);
2432  UseMI.setDesc(get(NewOpc));
2433  // It might happen that UseMI was commuted and we now have an SGPR as SRC1.
2434  // If so, the inlined constant and the SGPR cannot legally be used together,
2435  // so legalize the operands.
2436  legalizeOperands(UseMI);
2437 
2438  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2439  if (DeleteDef)
2440  DefMI.eraseFromParent();
2441 
2442  return true;
2443  }
2444  }
2445 
2446  return false;
2447 }
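// Illustrative sketch, not part of the original file: the two foldings above
// in terms of the underlying arithmetic, with K standing for the folded
// literal. v_madmk_* keeps the constant as the multiplicand, v_madak_* keeps
// it as the addend; the hypothetical helpers below mirror those forms.
static float exampleMadmk(float Src0, float K, float Src2) {
  return Src0 * K + Src2;   // v_madmk_f32 dst, src0, K, src2
}
static float exampleMadak(float Src0, float Src1, float K) {
  return Src0 * Src1 + K;   // v_madak_f32 dst, src0, src1, K
}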
2448 
2449 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2450  int WidthB, int OffsetB) {
2451  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2452  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2453  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2454  return LowOffset + LowWidth <= HighOffset;
2455 }
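// Illustrative sketch, not part of the original file: a constexpr mirror of the
// check above with two worked cases. A 4-byte access at offset 0 and an 8-byte
// access at offset 4 are disjoint because 0 + 4 <= 4; widening the first access
// to 8 bytes makes them overlap.
static constexpr bool exampleDisjoint(int WidthA, int OffsetA, int WidthB, int OffsetB) {
  return OffsetA < OffsetB ? OffsetA + WidthA <= OffsetB
                           : OffsetB + WidthB <= OffsetA;
}
static_assert(exampleDisjoint(4, 0, 8, 4), "adjacent accesses do not overlap");
static_assert(!exampleDisjoint(8, 0, 4, 4), "an 8-byte access at 0 covers offset 4");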
2456 
2457 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
2458  const MachineInstr &MIb) const {
2459  const MachineOperand *BaseOp0, *BaseOp1;
2460  int64_t Offset0, Offset1;
2461 
2462  if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
2463  getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
2464  if (!BaseOp0->isIdenticalTo(*BaseOp1))
2465  return false;
2466 
2467  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2468  // FIXME: Handle ds_read2 / ds_write2.
2469  return false;
2470  }
2471  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2472  unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2473  if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2474  return true;
2475  }
2476  }
2477 
2478  return false;
2479 }
2480 
2481 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
2482  const MachineInstr &MIb,
2483  AliasAnalysis *AA) const {
2484  assert((MIa.mayLoad() || MIa.mayStore()) &&
2485  "MIa must load from or modify a memory location");
2486  assert((MIb.mayLoad() || MIb.mayStore()) &&
2487  "MIb must load from or modify a memory location");
2488 
2490  return false;
2491 
2492  // XXX - Can we relax this between address spaces?
2493  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2494  return false;
2495 
2496  // TODO: Should we check the address space from the MachineMemOperand? That
2497  // would allow us to distinguish objects we know don't alias based on the
2498  // underlying address space, even if it was lowered to a different one,
2499  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2500  // buffer.
2501  if (isDS(MIa)) {
2502  if (isDS(MIb))
2503  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2504 
2505  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2506  }
2507 
2508  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2509  if (isMUBUF(MIb) || isMTBUF(MIb))
2510  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2511 
2512  return !isFLAT(MIb) && !isSMRD(MIb);
2513  }
2514 
2515  if (isSMRD(MIa)) {
2516  if (isSMRD(MIb))
2517  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2518 
2519  return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2520  }
2521 
2522  if (isFLAT(MIa)) {
2523  if (isFLAT(MIb))
2524  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2525 
2526  return false;
2527  }
2528 
2529  return false;
2530 }
2531 
2532 static int64_t getFoldableImm(const MachineOperand* MO) {
2533  if (!MO->isReg())
2534  return false;
2535  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2536  const MachineRegisterInfo &MRI = MF->getRegInfo();
2537  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2538  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2539  Def->getOperand(1).isImm())
2540  return Def->getOperand(1).getImm();
2541  return AMDGPU::NoRegister;
2542 }
2543 
2544 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2545  MachineInstr &MI,
2546  LiveVariables *LV) const {
2547  unsigned Opc = MI.getOpcode();
2548  bool IsF16 = false;
2549  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2550  Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
2551 
2552  switch (Opc) {
2553  default:
2554  return nullptr;
2555  case AMDGPU::V_MAC_F16_e64:
2556  case AMDGPU::V_FMAC_F16_e64:
2557  IsF16 = true;
2558  LLVM_FALLTHROUGH;
2559  case AMDGPU::V_MAC_F32_e64:
2560  case AMDGPU::V_FMAC_F32_e64:
2561  break;
2562  case AMDGPU::V_MAC_F16_e32:
2563  case AMDGPU::V_FMAC_F16_e32:
2564  IsF16 = true;
2565  LLVM_FALLTHROUGH;
2566  case AMDGPU::V_MAC_F32_e32:
2567  case AMDGPU::V_FMAC_F32_e32: {
2568  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2569  AMDGPU::OpName::src0);
2570  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2571  if (!Src0->isReg() && !Src0->isImm())
2572  return nullptr;
2573 
2574  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2575  return nullptr;
2576 
2577  break;
2578  }
2579  }
2580 
2581  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2582  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2583  const MachineOperand *Src0Mods =
2584  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2585  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2586  const MachineOperand *Src1Mods =
2587  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2588  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2589  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2590  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2591 
2592  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
2593  // If we have an SGPR input, we will violate the constant bus restriction.
2594  (ST.getConstantBusLimit(Opc) > 1 ||
2595  !Src0->isReg() ||
2596  !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2597  if (auto Imm = getFoldableImm(Src2)) {
2598  unsigned NewOpc =
2599  IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
2600  : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
2601  if (pseudoToMCOpcode(NewOpc) != -1)
2602  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2603  .add(*Dst)
2604  .add(*Src0)
2605  .add(*Src1)
2606  .addImm(Imm);
2607  }
2608  unsigned NewOpc =
2609  IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
2610  : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
2611  if (auto Imm = getFoldableImm(Src1)) {
2612  if (pseudoToMCOpcode(NewOpc) != -1)
2613  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2614  .add(*Dst)
2615  .add(*Src0)
2616  .addImm(Imm)
2617  .add(*Src2);
2618  }
2619  if (auto Imm = getFoldableImm(Src0)) {
2620  if (pseudoToMCOpcode(NewOpc) != -1 &&
2621  isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
2622  AMDGPU::OpName::src0), Src1))
2623  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2624  .add(*Dst)
2625  .add(*Src1)
2626  .addImm(Imm)
2627  .add(*Src2);
2628  }
2629  }
2630 
2631  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
2632  : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2633  if (pseudoToMCOpcode(NewOpc) == -1)
2634  return nullptr;
2635 
2636  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2637  .add(*Dst)
2638  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2639  .add(*Src0)
2640  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2641  .add(*Src1)
2642  .addImm(0) // Src mods
2643  .add(*Src2)
2644  .addImm(Clamp ? Clamp->getImm() : 0)
2645  .addImm(Omod ? Omod->getImm() : 0);
2646 }
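// Illustrative note, not part of the original file: with hypothetical
// registers, the conversion above rewrites the tied two-address accumulate
//   v_mac_f32 v0, v1, v2          ; v0 = v1 * v2 + v0, v0 tied as src2
// into the explicit three-address form
//   v_mad_f32 v0, v1, v2, v0      ; same arithmetic, no tied operand
// and, when one source is a foldable move-immediate, prefers the
// v_madmk/v_madak (or v_fmamk/v_fmaak) encodings so the literal is carried
// directly in the instruction.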
2647 
2648 // It's not generally safe to move VALU instructions across these since it will
2649 // start using the register as a base index rather than directly.
2650 // XXX - Why isn't hasSideEffects sufficient for these?
2651 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2652  switch (MI.getOpcode()) {
2653  case AMDGPU::S_SET_GPR_IDX_ON:
2654  case AMDGPU::S_SET_GPR_IDX_MODE:
2655  case AMDGPU::S_SET_GPR_IDX_OFF:
2656  return true;
2657  default:
2658  return false;
2659  }
2660 }
2661 
2662 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2663  const MachineBasicBlock *MBB,
2664  const MachineFunction &MF) const {
2665  // XXX - Do we want the SP check in the base implementation?
2666 
2667  // Target-independent instructions do not have an implicit-use of EXEC, even
2668  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2669  // boundaries prevents incorrect movements of such instructions.
2670  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2671  MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2672  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2673  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2674  MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
2675  changesVGPRIndexingMode(MI);
2676 }
2677 
2678 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
2679  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
2680  Opcode == AMDGPU::DS_GWS_INIT ||
2681  Opcode == AMDGPU::DS_GWS_SEMA_V ||
2682  Opcode == AMDGPU::DS_GWS_SEMA_BR ||
2683  Opcode == AMDGPU::DS_GWS_SEMA_P ||
2684  Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
2685  Opcode == AMDGPU::DS_GWS_BARRIER;
2686 }
2687 
2688 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
2689  unsigned Opcode = MI.getOpcode();
2690 
2691  if (MI.mayStore() && isSMRD(MI))
2692  return true; // scalar store or atomic
2693 
2694  // This will terminate the function when other lanes may need to continue.
2695  if (MI.isReturn())
2696  return true;
2697 
2698  // These instructions cause shader I/O that may cause hardware lockups
2699  // when executed with an empty EXEC mask.
2700  //
2701  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2702  // EXEC = 0, but checking for that case here seems not worth it
2703  // given the typical code patterns.
2704  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2705  Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
2706  Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
2707  Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
2708  return true;
2709 
2710  if (MI.isCall() || MI.isInlineAsm())
2711  return true; // conservative assumption
2712 
2713  // These are like SALU instructions in terms of effects, so it's questionable
2714  // whether we should return true for those.
2715  //
2716  // However, executing them with EXEC = 0 causes them to operate on undefined
2717  // data, which we avoid by returning true here.
2718  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2719  return true;
2720 
2721  return false;
2722 }
2723 
2724 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
2725  const MachineInstr &MI) const {
2726  if (MI.isMetaInstruction())
2727  return false;
2728 
2729  // This won't read exec if this is an SGPR->SGPR copy.
2730  if (MI.isCopyLike()) {
2731  if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
2732  return true;
2733 
2734  // Make sure this isn't copying exec as a normal operand
2735  return MI.readsRegister(AMDGPU::EXEC, &RI);
2736  }
2737 
2738  // Make a conservative assumption about the callee.
2739  if (MI.isCall())
2740  return true;
2741 
2742  // Be conservative with any unhandled generic opcodes.
2743  if (!isTargetSpecificOpcode(MI.getOpcode()))
2744  return true;
2745 
2746  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
2747 }
2748 
2749 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2750  switch (Imm.getBitWidth()) {
2751  case 1: // This likely will be a condition code mask.
2752  return true;
2753 
2754  case 32:
2755  return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2756  ST.hasInv2PiInlineImm());
2757  case 64:
2758  return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2759  ST.hasInv2PiInlineImm());
2760  case 16:
2761  return ST.has16BitInsts() &&
2762  AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2763  ST.hasInv2PiInlineImm());
2764  default:
2765  llvm_unreachable("invalid bitwidth");
2766  }
2767 }
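// Illustrative sketch, not part of the original file: a rough mirror of the
// 32-bit integer part of the check above. Inline constants cover the small
// integers -16..64 plus a handful of float bit patterns (+/-0.5, +/-1.0,
// +/-2.0, +/-4.0, 0.0 and, on subtargets with the inv-2pi immediate, 1/(2*pi));
// anything else must be encoded as a literal.
static bool exampleIsInlineSmallInt(int64_t Imm) {
  return Imm >= -16 && Imm <= 64;
}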
2768 
2769 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2770  uint8_t OperandType) const {
2771  if (!MO.isImm() ||
2772  OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2773  OperandType > AMDGPU::OPERAND_SRC_LAST)
2774  return false;
2775 
2776  // MachineOperand provides no way to tell the true operand size, since it only
2777  // records a 64-bit value. We need to know the size to determine if a 32-bit
2778  // floating point immediate bit pattern is legal for an integer immediate. It
2779  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2780 
2781  int64_t Imm = MO.getImm();
2782  switch (OperandType) {
2783  case AMDGPU::OPERAND_REG_IMM_INT32:
2784  case AMDGPU::OPERAND_REG_IMM_FP32:
2785  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2786  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2787  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
2788  case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
2789  int32_t Trunc = static_cast<int32_t>(Imm);
2790  return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
2791  }
2792  case AMDGPU::OPERAND_REG_IMM_INT64:
2793  case AMDGPU::OPERAND_REG_IMM_FP64:
2794  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2795  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2796  return AMDGPU::isInlinableLiteral64(MO.getImm(),
2797  ST.hasInv2PiInlineImm());
2798  case AMDGPU::OPERAND_REG_IMM_INT16:
2799  case AMDGPU::OPERAND_REG_IMM_FP16:
2800  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2801  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
2802  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
2803  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
2804  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2805  // A few special case instructions have 16-bit operands on subtargets
2806  // where 16-bit instructions are not legal.
2807  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2808  // constants in these cases
2809  int16_t Trunc = static_cast<int16_t>(Imm);
2810  return ST.has16BitInsts() &&
2811  AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2812  }
2813 
2814  return false;
2815  }
2816  case AMDGPU::OPERAND_REG_IMM_V2INT16:
2817  case AMDGPU::OPERAND_REG_IMM_V2FP16:
2818  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
2819  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
2820  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
2821  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
2822  uint32_t Trunc = static_cast<uint32_t>(Imm);
2823  return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
2824  }
2825  default:
2826  llvm_unreachable("invalid bitwidth");
2827  }
2828 }
2829 
2830 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2831  const MCOperandInfo &OpInfo) const {
2832  switch (MO.getType()) {
2833  case MachineOperand::MO_Register:
2834  return false;
2835  case MachineOperand::MO_Immediate:
2836  return !isInlineConstant(MO, OpInfo);
2837  case MachineOperand::MO_FrameIndex:
2838  case MachineOperand::MO_MachineBasicBlock:
2839  case MachineOperand::MO_ExternalSymbol:
2840  case MachineOperand::MO_GlobalAddress:
2841  case MachineOperand::MO_MCSymbol:
2842  return true;
2843  default:
2844  llvm_unreachable("unexpected operand type");
2845  }
2846 }
2847 
2848 static bool compareMachineOp(const MachineOperand &Op0,
2849  const MachineOperand &Op1) {
2850  if (Op0.getType() != Op1.getType())
2851  return false;
2852 
2853  switch (Op0.getType()) {
2854  case MachineOperand::MO_Register:
2855  return Op0.getReg() == Op1.getReg();
2856  case MachineOperand::MO_Immediate:
2857  return Op0.getImm() == Op1.getImm();
2858  default:
2859  llvm_unreachable("Didn't expect to be comparing these operand types");
2860  }
2861 }
2862 
2863 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2864  const MachineOperand &MO) const {
2865  const MCInstrDesc &InstDesc = MI.getDesc();
2866  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
2867 
2868  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
2869 
2870  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2871  return true;
2872 
2873  if (OpInfo.RegClass < 0)
2874  return false;
2875 
2876  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2877  return RI.opCanUseInlineConstant(OpInfo.OperandType);
2878 
2879  if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
2880  return false;
2881 
2882  if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
2883  return true;
2884 
2885  const MachineFunction *MF = MI.getParent()->getParent();
2886  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2887  return ST.hasVOP3Literal();
2888 }
2889 
2890 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2891  int Op32 = AMDGPU::getVOPe32(Opcode);
2892  if (Op32 == -1)
2893  return false;
2894 
2895  return pseudoToMCOpcode(Op32) != -1;
2896 }
2897 
2898 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2899  // The src0_modifier operand is present on all instructions
2900  // that have modifiers.
2901 
2902  return AMDGPU::getNamedOperandIdx(Opcode,
2903  AMDGPU::OpName::src0_modifiers) != -1;
2904 }
2905 
2906 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2907  unsigned OpName) const {
2908  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2909  return Mods && Mods->getImm();
2910 }
2911 
2913  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2914  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2915  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2916  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2917  hasModifiersSet(MI, AMDGPU::OpName::omod);
2918 }
2919 
2920 bool SIInstrInfo::canShrink(const MachineInstr &MI,
2921  const MachineRegisterInfo &MRI) const {
2922  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2923  // Can't shrink instruction with three operands.
2924  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
2925  // a special case for it. It can only be shrunk if the third operand
2926  // is vcc, and src0_modifiers and src1_modifiers are not set.
2927  // We should handle this the same way we handle vopc, by adding
2928  // a register allocation hint pre-regalloc and then doing the shrinking
2929  // post-regalloc.
2930  if (Src2) {
2931  switch (MI.getOpcode()) {
2932  default: return false;
2933 
2934  case AMDGPU::V_ADDC_U32_e64:
2935  case AMDGPU::V_SUBB_U32_e64:
2936  case AMDGPU::V_SUBBREV_U32_e64: {
2937  const MachineOperand *Src1
2938  = getNamedOperand(MI, AMDGPU::OpName::src1);
2939  if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
2940  return false;
2941  // Additional verification is needed for sdst/src2.
2942  return true;
2943  }
2944  case AMDGPU::V_MAC_F32_e64:
2945  case AMDGPU::V_MAC_F16_e64:
2946  case AMDGPU::V_FMAC_F32_e64:
2947  case AMDGPU::V_FMAC_F16_e64:
2948  if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
2949  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
2950  return false;
2951  break;
2952 
2953  case AMDGPU::V_CNDMASK_B32_e64:
2954  break;
2955  }
2956  }
2957 
2958  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2959  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
2960  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
2961  return false;
2962 
2963  // We don't need to check src0, all input types are legal, so just make sure
2964  // src0 isn't using any modifiers.
2965  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
2966  return false;
2967 
2968  // Can it be shrunk to a valid 32 bit opcode?
2969  if (!hasVALU32BitEncoding(MI.getOpcode()))
2970  return false;
2971 
2972  // Check output modifiers
2973  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
2974  !hasModifiersSet(MI, AMDGPU::OpName::clamp);
2975 }
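// Illustrative note, not part of the original file: an instruction that passes
// the checks above can drop to its 32-bit VOP2 encoding, e.g. (hypothetical
// registers)
//   v_add_f32_e64 v0, v1, v2      ; no omod/clamp, no source modifiers
// shrinks to
//   v_add_f32_e32 v0, v1, v2      ; half the encoding size
// while a form that needs output modifiers or a non-VCC carry output has to
// stay in the VOP3 encoding.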
2976 
2977 // Set VCC operand with all flags from \p Orig, except for setting it as
2978 // implicit.
2979 static void copyFlagsToImplicitVCC(MachineInstr &MI,
2980  const MachineOperand &Orig) {
2981 
2982  for (MachineOperand &Use : MI.implicit_operands()) {
2983  if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
2984  Use.setIsUndef(Orig.isUndef());
2985  Use.setIsKill(Orig.isKill());
2986  return;
2987  }
2988  }
2989 }
2990 
2991 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
2992  unsigned Op32) const {
2993  MachineBasicBlock *MBB = MI.getParent();
2994  MachineInstrBuilder Inst32 =
2995  BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
2996 
2997  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
2998  // For VOPC instructions, this is replaced by an implicit def of vcc.
2999  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
3000  if (Op32DstIdx != -1) {
3001  // dst
3002  Inst32.add(MI.getOperand(0));
3003  } else {
3004  assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
3005  (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
3006  "Unexpected case");
3007  }
3008 
3009  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
3010 
3011  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3012  if (Src1)
3013  Inst32.add(*Src1);
3014 
3015  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3016 
3017  if (Src2) {
3018  int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
3019  if (Op32Src2Idx != -1) {
3020  Inst32.add(*Src2);
3021  } else {
3022  // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
3023  // replaced with an implicit read of vcc. This was already added
3024  // during the initial BuildMI, so find it to preserve the flags.
3025  copyFlagsToImplicitVCC(*Inst32, *Src2);
3026  }
3027  }
3028 
3029  return Inst32;
3030 }
3031 
3032 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
3033  const MachineOperand &MO,
3034  const MCOperandInfo &OpInfo) const {
3035  // Literal constants use the constant bus.
3036  //if (isLiteralConstantLike(MO, OpInfo))
3037  // return true;
3038  if (MO.isImm())
3039  return !isInlineConstant(MO, OpInfo);
3040 
3041  if (!MO.isReg())
3042  return true; // Misc other operands like FrameIndex
3043 
3044  if (!MO.isUse())
3045  return false;
3046 
3046 
3047  if (Register::isVirtualRegister(MO.getReg()))
3048  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
3049 
3050  // Null is free
3051  if (MO.getReg() == AMDGPU::SGPR_NULL)
3052  return false;
3053 
3054  // SGPRs use the constant bus
3055  if (MO.isImplicit()) {
3056  return MO.getReg() == AMDGPU::M0 ||
3057  MO.getReg() == AMDGPU::VCC ||
3058  MO.getReg() == AMDGPU::VCC_LO;
3059  } else {
3060  return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
3061  AMDGPU::SReg_64RegClass.contains(MO.getReg());
3062  }
3063 }
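// Illustrative sketch, not part of the original file: a simplified decision
// table for the rules implemented above, using a hypothetical operand-kind
// enum rather than MachineOperand.
enum class ExampleOperandKind { InlineConst, Literal, VGPR, SGPR, M0OrVcc, NullReg };
static bool exampleUsesConstantBus(ExampleOperandKind K) {
  switch (K) {
  case ExampleOperandKind::Literal:      // literal constants are read over the bus
  case ExampleOperandKind::SGPR:         // as are scalar registers...
  case ExampleOperandKind::M0OrVcc:      // ...including m0 and vcc
    return true;
  case ExampleOperandKind::InlineConst:  // inline constants are free
  case ExampleOperandKind::VGPR:         // VGPR reads do not touch the bus
  case ExampleOperandKind::NullReg:      // and neither does the null register
    return false;
  }
  return false;
}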
3064 
3065 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
3066  for (const MachineOperand &MO : MI.implicit_operands()) {
3067  // We only care about reads.
3068  if (MO.isDef())
3069  continue;
3070 
3071  switch (MO.getReg()) {
3072  case AMDGPU::VCC:
3073  case AMDGPU::VCC_LO:
3074  case AMDGPU::VCC_HI:
3075  case AMDGPU::M0:
3076  case AMDGPU::FLAT_SCR:
3077  return MO.getReg();
3078 
3079  default:
3080  break;
3081  }
3082  }
3083 
3084  return AMDGPU::NoRegister;
3085 }
3086 
3087 static bool shouldReadExec(const MachineInstr &MI) {
3088  if (SIInstrInfo::isVALU(MI)) {
3089  switch (MI.getOpcode()) {
3090  case AMDGPU::V_READLANE_B32:
3091  case AMDGPU::V_READLANE_B32_gfx6_gfx7:
3092  case AMDGPU::V_READLANE_B32_gfx10:
3093  case AMDGPU::V_READLANE_B32_vi:
3094  case AMDGPU::V_WRITELANE_B32:
3095  case AMDGPU::V_WRITELANE_B32_gfx6_gfx7:
3096  case AMDGPU::V_WRITELANE_B32_gfx10:
3097  case AMDGPU::V_WRITELANE_B32_vi:
3098  return false;
3099  }
3100 
3101  return true;
3102  }
3103 
3104  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
3105  SIInstrInfo::isSALU(MI) ||
3106  SIInstrInfo::isSMRD(MI))
3107  return false;
3108 
3109  return true;
3110 }
3111 
3112 static bool isSubRegOf(const SIRegisterInfo &TRI,
3113  const MachineOperand &SuperVec,
3114  const MachineOperand &SubReg) {
3115  if (Register::isPhysicalRegister(SubReg.getReg()))
3116  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
3117 
3118  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
3119  SubReg.getReg() == SuperVec.getReg();
3120 }
3121 
3122 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
3123  StringRef &ErrInfo) const {
3124  uint16_t Opcode = MI.getOpcode();
3125  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
3126  return true;
3127 
3128  const MachineFunction *MF = MI.getParent()->getParent();
3129  const MachineRegisterInfo &MRI = MF->getRegInfo();
3130 
3131  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
3132  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
3133  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
3134 
3135  // Make sure the number of operands is correct.
3136  const MCInstrDesc &Desc = get(Opcode);
3137  if (!Desc.isVariadic() &&
3138  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
3139  ErrInfo = "Instruction has wrong number of operands.";
3140  return false;
3141  }
3142 
3143  if (MI.isInlineAsm()) {
3144  // Verify register classes for inlineasm constraints.
3145  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
3146  I != E; ++I) {
3147  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
3148  if (!RC)
3149  continue;
3150 
3151  const MachineOperand &Op = MI.getOperand(I);
3152  if (!Op.isReg())
3153  continue;
3154 
3155  Register Reg = Op.getReg();
3156  if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) {
3157  ErrInfo = "inlineasm operand has incorrect register class.";
3158  return false;
3159  }
3160  }
3161 
3162  return true;
3163  }
3164 
3165  // Make sure the register classes are correct.
3166  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
3167  if (MI.getOperand(i).isFPImm()) {
3168  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
3169  "all fp values to integers.";
3170  return false;
3171  }
3172 
3173  int RegClass = Desc.OpInfo[i].RegClass;
3174 
3175  switch (Desc.OpInfo[i].OperandType) {
3176  case MCOI::OPERAND_REGISTER:
3177  if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
3178  ErrInfo = "Illegal immediate value for operand.";
3179  return false;
3180  }
3181  break;
3182  case AMDGPU::OPERAND_REG_IMM_INT32:
3183  case AMDGPU::OPERAND_REG_IMM_FP32:
3184  break;
3185  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
3186  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
3187  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
3188  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
3189  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
3190  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
3191  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
3192  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
3193  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
3194  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
3195  const MachineOperand &MO = MI.getOperand(i);
3196  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
3197  ErrInfo = "Illegal immediate value for operand.";
3198  return false;
3199  }
3200  break;
3201  }
3202  case MCOI::OPERAND_IMMEDIATE:
3203  case AMDGPU::OPERAND_KIMM32:
3204  // Check if this operand is an immediate.
3205  // FrameIndex operands will be replaced by immediates, so they are
3206  // allowed.
3207  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
3208  ErrInfo = "Expected immediate, but got non-immediate";
3209  return false;
3210  }
3211  LLVM_FALLTHROUGH;
3212  default:
3213  continue;
3214  }
3215 
3216  if (!MI.getOperand(i).isReg())
3217  continue;
3218 
3219  if (RegClass != -1) {
3220  Register Reg = MI.getOperand(i).getReg();
3221  if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg))
3222  continue;
3223 
3224  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
3225  if (!RC->contains(Reg)) {
3226  ErrInfo = "Operand has incorrect register class.";
3227  return false;
3228  }
3229  }
3230  }
3231 
3232  // Verify SDWA
3233  if (isSDWA(MI)) {
3234  if (!ST.hasSDWA()) {
3235  ErrInfo = "SDWA is not supported on this target";
3236  return false;
3237  }
3238 
3239  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
3240 
3241  const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
3242 
3243  for (int OpIdx: OpIndicies) {
3244  if (OpIdx == -1)
3245  continue;
3246  const MachineOperand &MO = MI.getOperand(OpIdx);
3247 
3248  if (!ST.hasSDWAScalar()) {
3249  // Only VGPRS on VI
3250  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
3251  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
3252  return false;
3253  }
3254  } else {
3255  // No immediates on GFX9
3256  if (!MO.isReg()) {
3257  ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
3258  return false;
3259  }
3260  }
3261  }
3262 
3263  if (!ST.hasSDWAOmod()) {
3264  // No omod allowed on VI
3265  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3266  if (OMod != nullptr &&
3267  (!OMod->isImm() || OMod->getImm() != 0)) {
3268  ErrInfo = "OMod not allowed in SDWA instructions on VI";
3269  return false;
3270  }
3271  }
3272 
3273  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
3274  if (isVOPC(BasicOpcode)) {
3275  if (!ST.hasSDWASdst() && DstIdx != -1) {
3276  // Only vcc allowed as dst on VI for VOPC
3277  const MachineOperand &Dst = MI.getOperand(DstIdx);
3278  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
3279  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
3280  return false;
3281  }
3282  } else if (!ST.hasSDWAOutModsVOPC()) {
3283  // No clamp allowed on GFX9 for VOPC
3284  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3285  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
3286  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
3287  return false;
3288  }
3289 
3290  // No omod allowed on GFX9 for VOPC
3291  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3292  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
3293  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
3294  return false;
3295  }
3296  }
3297  }
3298 
3299  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
3300  if (DstUnused && DstUnused->isImm() &&
3301  DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
3302  const MachineOperand &Dst = MI.getOperand(DstIdx);
3303  if (!Dst.isReg() || !Dst.isTied()) {
3304  ErrInfo = "Dst register should have tied register";
3305  return false;
3306  }
3307 
3308  const MachineOperand &TiedMO =
3309  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
3310  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
3311  ErrInfo =
3312  "Dst register should be tied to implicit use of preserved register";
3313  return false;
3314  } else if (Register::isPhysicalRegister(TiedMO.getReg()) &&
3315  Dst.getReg() != TiedMO.getReg()) {
3316  ErrInfo = "Dst register should use same physical register as preserved";
3317  return false;
3318  }
3319  }
3320  }
3321 
3322  // Verify MIMG
3323  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
3324  // Ensure that the return type used is large enough for all the options
3325  // being used TFE/LWE require an extra result register.
3326  const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
3327  if (DMask) {
3328  uint64_t DMaskImm = DMask->getImm();
3329  uint32_t RegCount =
3330  isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
3331  const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
3332  const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
3333  const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
3334 
3335  // Adjust for packed 16 bit values
3336  if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3337  RegCount >>= 1;
3338 
3339  // Adjust if using LWE or TFE
3340  if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3341  RegCount += 1;
3342 
3343  const uint32_t DstIdx =
3344  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3345  const MachineOperand &Dst = MI.getOperand(DstIdx);
3346  if (Dst.isReg()) {
3347  const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3348  uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3349  if (RegCount > DstSize) {
3350  ErrInfo = "MIMG instruction returns too many registers for dst "
3351  "register class";
3352  return false;
3353  }
3354  }
3355  }
3356  }
3357 
3358  // Verify VOP*. Ignore multiple sgpr operands on writelane.
3359  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
3360  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
3361  // Only look at the true operands. Only a real operand can use the constant
3362  // bus, and we don't want to check pseudo-operands like the source modifier
3363  // flags.
3364  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
3365 
3366  unsigned ConstantBusCount = 0;
3367  unsigned LiteralCount = 0;
3368 
3369  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
3370  ++ConstantBusCount;
3371 
3372  SmallVector<unsigned, 2> SGPRsUsed;
3373  unsigned SGPRUsed = findImplicitSGPRRead(MI);
3374  if (SGPRUsed != AMDGPU::NoRegister) {
3375  ++ConstantBusCount;
3376  SGPRsUsed.push_back(SGPRUsed);
3377  }
3378 
3379  for (int OpIdx : OpIndices) {
3380  if (OpIdx == -1)
3381  break;
3382  const MachineOperand &MO = MI.getOperand(OpIdx);
3383  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3384  if (MO.isReg()) {
3385  SGPRUsed = MO.getReg();
3386  if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
3387  return !RI.regsOverlap(SGPRUsed, SGPR);
3388  })) {
3389  ++ConstantBusCount;
3390  SGPRsUsed.push_back(SGPRUsed);
3391  }
3392  } else {
3393  ++ConstantBusCount;
3394  ++LiteralCount;
3395  }
3396  }
3397  }
3398  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3399  // v_writelane_b32 is an exception from constant bus restriction:
3400  // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
3401  if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
3402  Opcode != AMDGPU::V_WRITELANE_B32) {
3403  ErrInfo = "VOP* instruction violates constant bus restriction";
3404  return false;
3405  }
3406 
3407  if (isVOP3(MI) && LiteralCount) {
3408  if (LiteralCount && !ST.hasVOP3Literal()) {
3409  ErrInfo = "VOP3 instruction uses literal";
3410  return false;
3411  }
3412  if (LiteralCount > 1) {
3413  ErrInfo = "VOP3 instruction uses more than one literal";
3414  return false;
3415  }
3416  }
3417  }
3418 
3419  // Verify misc. restrictions on specific instructions.
3420  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
3421  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
3422  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3423  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
3424  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
3425  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
3426  if (!compareMachineOp(Src0, Src1) &&
3427  !compareMachineOp(Src0, Src2)) {
3428  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
3429  return false;
3430  }
3431  }
3432  }
3433 
3434  if (isSOP2(MI) || isSOPC(MI)) {
3435  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3436  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
3437  unsigned Immediates = 0;
3438 
3439  if (!Src0.isReg() &&
3440  !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
3441  Immediates++;
3442  if (!Src1.isReg() &&
3443  !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
3444  Immediates++;
3445 
3446  if (Immediates > 1) {
3447  ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
3448  return false;
3449  }
3450  }
3451 
3452  if (isSOPK(MI)) {
3453  auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
3454  if (Desc.isBranch()) {
3455  if (!Op->isMBB()) {
3456  ErrInfo = "invalid branch target for SOPK instruction";
3457  return false;
3458  }
3459  } else {
3460  uint64_t Imm = Op->getImm();
3461  if (sopkIsZext(MI)) {
3462  if (!isUInt<16>(Imm)) {
3463  ErrInfo = "invalid immediate for SOPK instruction";
3464  return false;
3465  }
3466  } else {
3467  if (!isInt<16>(Imm)) {
3468  ErrInfo = "invalid immediate for SOPK instruction";
3469  return false;
3470  }
3471  }
3472  }
3473  }
3474 
3475  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
3476  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
3477  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3478  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
3479  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3480  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
3481 
3482  const unsigned StaticNumOps = Desc.getNumOperands() +
3483  Desc.getNumImplicitUses();
3484  const unsigned NumImplicitOps = IsDst ? 2 : 1;
3485 
3486  // Allow additional implicit operands. This allows a fixup done by the post
3487  // RA scheduler where the main implicit operand is killed and implicit-defs
3488  // are added for sub-registers that remain live after this instruction.
3489  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
3490  ErrInfo = "missing implicit register operands";
3491  return false;
3492  }
3493 
3494  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3495  if (IsDst) {
3496  if (!Dst->isUse()) {
3497  ErrInfo = "v_movreld_b32 vdst should be a use operand";
3498  return false;
3499  }
3500 
3501  unsigned UseOpIdx;
3502  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
3503  UseOpIdx != StaticNumOps + 1) {
3504  ErrInfo = "movrel implicit operands should be tied";
3505  return false;
3506  }
3507  }
3508 
3509  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3510  const MachineOperand &ImpUse
3511  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
3512  if (!ImpUse.isReg() || !ImpUse.isUse() ||
3513  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
3514  ErrInfo = "src0 should be subreg of implicit vector use";
3515  return false;
3516  }
3517  }
3518 
3519  // Make sure we aren't losing exec uses in the td files. This mostly requires
3520  // being careful when using let Uses to try to add other use registers.
3521  if (shouldReadExec(MI)) {
3522  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
3523  ErrInfo = "VALU instruction does not implicitly read exec mask";
3524  return false;
3525  }
3526  }
3527 
3528  if (isSMRD(MI)) {
3529  if (MI.mayStore()) {
3530  // The register offset form of scalar stores may only use m0 as the
3531  // soffset register.
3532  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
3533  if (Soff && Soff->getReg() != AMDGPU::M0) {
3534  ErrInfo = "scalar stores must use m0 as offset register";
3535  return false;
3536  }
3537  }
3538  }
3539 
3540  if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
3541  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3542  if (Offset->getImm() != 0) {
3543  ErrInfo = "subtarget does not support offsets in flat instructions";
3544  return false;
3545  }
3546  }
3547 
3548  if (isMIMG(MI)) {
3549  const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
3550  if (DimOp) {
3551  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
3552  AMDGPU::OpName::vaddr0);
3553  int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
3554  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
3555  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3556  AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
3557  const AMDGPU::MIMGDimInfo *Dim =
3558  AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
3559 
3560  if (!Dim) {
3561  ErrInfo = "dim is out of range";
3562  return false;
3563  }
3564 
3565  bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
3566  unsigned AddrWords = BaseOpcode->NumExtraArgs +
3567  (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
3568  (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
3569  (BaseOpcode->LodOrClampOrMip ? 1 : 0);
3570 
3571  unsigned VAddrWords;
3572  if (IsNSA) {
3573  VAddrWords = SRsrcIdx - VAddr0Idx;
3574  } else {
3575  const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
3576  VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
3577  if (AddrWords > 8)
3578  AddrWords = 16;
3579  else if (AddrWords > 4)
3580  AddrWords = 8;
3581  else if (AddrWords == 3 && VAddrWords == 4) {
3582  // CodeGen uses the V4 variant of instructions for three addresses,
3583  // because the selection DAG does not support non-power-of-two types.
3584  AddrWords = 4;
3585  }
3586  }
3587 
3588  if (VAddrWords != AddrWords) {
3589  ErrInfo = "bad vaddr size";
3590  return false;
3591  }
3592  }
3593  }
3594 
3595  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
3596  if (DppCt) {
3597  using namespace AMDGPU::DPP;
3598 
3599  unsigned DC = DppCt->getImm();
3600  if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
3601  DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
3602  (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
3603  (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
3604  (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
3605  (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
3606  (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
3607  ErrInfo = "Invalid dpp_ctrl value";
3608  return false;
3609  }
3610  if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
3611  ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
3612  ErrInfo = "Invalid dpp_ctrl value: "
3613  "wavefront shifts are not supported on GFX10+";
3614  return false;
3615  }
3616  if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
3617  ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
3618  ErrInfo = "Invalid dpp_ctrl value: "
3619  "broadcasts are not supported on GFX10+";
3620  return false;
3621  }
3622  if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
3623  ST.getGeneration() < AMDGPUSubtarget::GFX10) {
3624  ErrInfo = "Invalid dpp_ctrl value: "
3625  "row_share and row_xmask are not supported before GFX10";
3626  return false;
3627  }
3628  }
3629 
3630  return true;
3631 }
3632 
3633 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
3634  switch (MI.getOpcode()) {
3635  default: return AMDGPU::INSTRUCTION_LIST_END;
3636  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
3637  case AMDGPU::COPY: return AMDGPU::COPY;
3638  case AMDGPU::PHI: return AMDGPU::PHI;
3639  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
3640  case AMDGPU::WQM: return AMDGPU::WQM;
3641  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
3642  case AMDGPU::WWM: return AMDGPU::WWM;
3643  case AMDGPU::S_MOV_B32: {
3644  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3645  return MI.getOperand(1).isReg() ||
3646  RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
3647  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
3648  }
3649  case AMDGPU::S_ADD_I32:
3650  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
3651  case AMDGPU::S_ADDC_U32:
3652  return AMDGPU::V_ADDC_U32_e32;
3653  case AMDGPU::S_SUB_I32:
3654  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
3655  // FIXME: These are not consistently handled, and selected when the carry is
3656  // used.
3657  case AMDGPU::S_ADD_U32:
3658  return AMDGPU::V_ADD_I32_e32;
3659  case AMDGPU::S_SUB_U32:
3660  return AMDGPU::V_SUB_I32_e32;
3661  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3662  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
3663  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
3664  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
3665  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3666  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3667  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3668  case AMDGPU::S_XNOR_B32:
3669  return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
3670  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3671  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3672  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3673  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3674  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3675  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3676  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3677  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3678  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3679  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3680  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3681  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3682  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3683  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3684  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3685  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3686  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3687  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3688  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3689  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3690  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3691  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3692  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3693  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3694  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3695  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3696  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3697  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3698  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3699  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3700  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3701  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3702  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3703  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3704  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3705  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3706  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3707  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3708  }
3710  "Unexpected scalar opcode without corresponding vector one!");
3711 }
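// Illustrative note (not part of the original source): moveToVALU below uses
// this mapping when it rewrites an SALU instruction in place. For example, a
// made-up instruction
//   s2 = S_AND_B32 s0, s1
// is given the descriptor for V_AND_B32_e64 and then has its operands
// legalized; opcodes that map to INSTRUCTION_LIST_END are instead handled by
// the dedicated split/lower helpers later in this file.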
3712 
3713 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
3714  unsigned OpNo) const {
3715  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3716  const MCInstrDesc &Desc = get(MI.getOpcode());
3717  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
3718  Desc.OpInfo[OpNo].RegClass == -1) {
3719  Register Reg = MI.getOperand(OpNo).getReg();
3720 
3721  if (Register::isVirtualRegister(Reg))
3722  return MRI.getRegClass(Reg);
3723  return RI.getPhysRegClass(Reg);
3724  }
3725 
3726  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3727  return RI.getRegClass(RCID);
3728 }
3729 
3730 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3731  MachineBasicBlock::iterator I = MI;
3732  MachineBasicBlock *MBB = MI.getParent();
3733  MachineOperand &MO = MI.getOperand(OpIdx);
3734  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3735  const SIRegisterInfo *TRI =
3736  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3737  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3738  const TargetRegisterClass *RC = RI.getRegClass(RCID);
3739  unsigned Size = TRI->getRegSizeInBits(*RC);
3740  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
3741  if (MO.isReg())
3742  Opcode = AMDGPU::COPY;
3743  else if (RI.isSGPRClass(RC))
3744  Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
3745 
3746  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3747  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3748  VRC = &AMDGPU::VReg_64RegClass;
3749  else
3750  VRC = &AMDGPU::VGPR_32RegClass;
3751 
3752  Register Reg = MRI.createVirtualRegister(VRC);
3753  DebugLoc DL = MBB->findDebugLoc(I);
3754  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3755  MO.ChangeToRegister(Reg, false);
3756 }
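// Illustrative sketch (register names are made up): for an operand that is
// not legal in its slot, legalizeOpWithMove rewrites
//   v1 = V_FOO ..., s0, ...
// into
//   v2 = COPY s0            ; or a V_MOV_B32/S_MOV_B32 of an immediate
//   v1 = V_FOO ..., v2, ...
// using a VGPR (or 64-bit VGPR) class chosen from the operand's descriptor
// above.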
3757 
3758 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3759  MachineRegisterInfo &MRI,
3760  MachineOperand &SuperReg,
3761  const TargetRegisterClass *SuperRC,
3762  unsigned SubIdx,
3763  const TargetRegisterClass *SubRC)
3764  const {
3765  MachineBasicBlock *MBB = MI->getParent();
3766  DebugLoc DL = MI->getDebugLoc();
3767  Register SubReg = MRI.createVirtualRegister(SubRC);
3768 
3769  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3770  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3771  .addReg(SuperReg.getReg(), 0, SubIdx);
3772  return SubReg;
3773  }
3774 
3775  // Just in case the super register is itself a sub-register, copy it to a new
3776  // value so we don't need to worry about merging its subreg index with the
3777  // SubIdx passed to this function. The register coalescer should be able to
3778  // eliminate this extra copy.
3779  Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
3780 
3781  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3782  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3783 
3784  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3785  .addReg(NewSuperReg, 0, SubIdx);
3786 
3787  return SubReg;
3788 }
3789 
3790 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3791  MachineBasicBlock::iterator MII,
3792  MachineRegisterInfo &MRI,
3793  MachineOperand &Op,
3794  const TargetRegisterClass *SuperRC,
3795  unsigned SubIdx,
3796  const TargetRegisterClass *SubRC) const {
3797  if (Op.isImm()) {
3798  if (SubIdx == AMDGPU::sub0)
3799  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3800  if (SubIdx == AMDGPU::sub1)
3801  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3802 
3803  llvm_unreachable("Unhandled register index for immediate");
3804  }
3805 
3806  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3807  SubIdx, SubRC);
3808  return MachineOperand::CreateReg(SubReg, false);
3809 }
3810 
3811 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3812 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3813  assert(Inst.getNumExplicitOperands() == 3);
3814  MachineOperand Op1 = Inst.getOperand(1);
3815  Inst.RemoveOperand(1);
3816  Inst.addOperand(Op1);
3817 }
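// Example (made-up operands): swapOperands turns
//   S_LSHL_B32 dst, src0, src1
// into
//   S_LSHL_B32 dst, src1, src0
// which moveToVALU below relies on when it retargets scalar shifts to the
// *REV VALU forms that take the shift amount as the first source.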
3818 
3819 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3820  const MCOperandInfo &OpInfo,
3821  const MachineOperand &MO) const {
3822  if (!MO.isReg())
3823  return false;
3824 
3825  Register Reg = MO.getReg();
3826  const TargetRegisterClass *RC = Register::isVirtualRegister(Reg)
3827  ? MRI.getRegClass(Reg)
3828  : RI.getPhysRegClass(Reg);
3829 
3830  const SIRegisterInfo *TRI =
3831  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3832  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3833 
3834  // In order to be legal, the common sub-class must be equal to the
3835  // class of the current operand. For example:
3836  //
3837  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3838  // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3839  //
3840  // s_sendmsg 0, s0 ; Operand defined as m0reg
3841  // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3842 
3843  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3844 }
3845 
3846 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3847  const MCOperandInfo &OpInfo,
3848  const MachineOperand &MO) const {
3849  if (MO.isReg())
3850  return isLegalRegOperand(MRI, OpInfo, MO);
3851 
3852  // Handle non-register types that are treated like immediates.
3853  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
3854  return true;
3855 }
3856 
3857 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3858  const MachineOperand *MO) const {
3859  const MachineFunction &MF = *MI.getParent()->getParent();
3860  const MachineRegisterInfo &MRI = MF.getRegInfo();
3861  const MCInstrDesc &InstDesc = MI.getDesc();
3862  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3863  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3864  const TargetRegisterClass *DefinedRC =
3865  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3866  if (!MO)
3867  MO = &MI.getOperand(OpIdx);
3868 
3869  int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
3870  int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
3871  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3872  if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
3873  return false;
3874 
3875  SmallDenseSet<RegSubRegPair> SGPRsUsed;
3876  if (MO->isReg())
3877  SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
3878 
3879  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3880  if (i == OpIdx)
3881  continue;
3882  const MachineOperand &Op = MI.getOperand(i);
3883  if (Op.isReg()) {
3884  RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
3885  if (!SGPRsUsed.count(SGPR) &&
3886  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3887  if (--ConstantBusLimit <= 0)
3888  return false;
3889  SGPRsUsed.insert(SGPR);
3890  }
3891  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3892  if (--ConstantBusLimit <= 0)
3893  return false;
3894  } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
3895  isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
3896  if (!VOP3LiteralLimit--)
3897  return false;
3898  if (--ConstantBusLimit <= 0)
3899  return false;
3900  }
3901  }
3902  }
3903 
3904  if (MO->isReg()) {
3905  assert(DefinedRC);
3906  return isLegalRegOperand(MRI, OpInfo, *MO);
3907  }
3908 
3909  // Handle non-register types that are treated like immediates.
3910  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
3911 
3912  if (!DefinedRC) {
3913  // This operand expects an immediate.
3914  return true;
3915  }
3916 
3917  return isImmOperandLegal(MI, OpIdx, *MO);
3918 }
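// Illustrative example of the constant bus accounting above (registers are
// made up): on a subtarget with ConstantBusLimit == 1, asking whether src1 of
//   v0 = V_ADD_F32_e64 s0, v1
// could be replaced by a second, different SGPR returns false, because s0
// already occupies the single constant bus slot; reusing s0 itself, or a
// plain VGPR, is still accepted.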
3919 
3920 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3921  MachineInstr &MI) const {
3922  unsigned Opc = MI.getOpcode();
3923  const MCInstrDesc &InstrDesc = get(Opc);
3924 
3925  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3926  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3927 
3928  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3929  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3930 
3931  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3932  // we need to only have one constant bus use before GFX10.
3933  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3934  if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
3935  Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
3936  isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
3937  legalizeOpWithMove(MI, Src0Idx);
3938 
3939  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3940  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3941  // src0/src1 with V_READFIRSTLANE.
3942  if (Opc == AMDGPU::V_WRITELANE_B32) {
3943  const DebugLoc &DL = MI.getDebugLoc();
3944  if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3945  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3946  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3947  .add(Src0);
3948  Src0.ChangeToRegister(Reg, false);
3949  }
3950  if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3951  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3952  const DebugLoc &DL = MI.getDebugLoc();
3953  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3954  .add(Src1);
3955  Src1.ChangeToRegister(Reg, false);
3956  }
3957  return;
3958  }
3959 
3960  // No VOP2 instructions support AGPRs.
3961  if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
3962  legalizeOpWithMove(MI, Src0Idx);
3963 
3964  if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
3965  legalizeOpWithMove(MI, Src1Idx);
3966 
3967  // VOP2 src0 instructions support all operand types, so we don't need to check
3968  // their legality. If src1 is already legal, we don't need to do anything.
3969  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3970  return;
3971 
3972  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3973  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3974  // select is uniform.
3975  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3976  RI.isVGPR(MRI, Src1.getReg())) {
3977  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3978  const DebugLoc &DL = MI.getDebugLoc();
3979  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3980  .add(Src1);
3981  Src1.ChangeToRegister(Reg, false);
3982  return;
3983  }
3984 
3985  // We do not use commuteInstruction here because it is too aggressive and will
3986  // commute if it is possible. We only want to commute here if it improves
3987  // legality. This can be called a fairly large number of times so don't waste
3988  // compile time pointlessly swapping and checking legality again.
3989  if (HasImplicitSGPR || !MI.isCommutable()) {
3990  legalizeOpWithMove(MI, Src1Idx);
3991  return;
3992  }
3993 
3994  // If src0 can be used as src1, commuting will make the operands legal.
3995  // Otherwise we have to give up and insert a move.
3996  //
3997  // TODO: Other immediate-like operand kinds could be commuted if there was a
3998  // MachineOperand::ChangeTo* for them.
3999  if ((!Src1.isImm() && !Src1.isReg()) ||
4000  !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
4001  legalizeOpWithMove(MI, Src1Idx);
4002  return;
4003  }
4004 
4005  int CommutedOpc = commuteOpcode(MI);
4006  if (CommutedOpc == -1) {
4007  legalizeOpWithMove(MI, Src1Idx);
4008  return;
4009  }
4010 
4011  MI.setDesc(get(CommutedOpc));
4012 
4013  Register Src0Reg = Src0.getReg();
4014  unsigned Src0SubReg = Src0.getSubReg();
4015  bool Src0Kill = Src0.isKill();
4016 
4017  if (Src1.isImm())
4018  Src0.ChangeToImmediate(Src1.getImm());
4019  else if (Src1.isReg()) {
4020  Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
4021  Src0.setSubReg(Src1.getSubReg());
4022  } else
4023  llvm_unreachable("Should only have register or immediate operands");
4024 
4025  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
4026  Src1.setSubReg(Src0SubReg);
4027  fixImplicitOperands(MI);
4028 }
4029 
4030 // Legalize VOP3 operands. All operand types are supported for any operand
4031 // but only one literal constant and only starting from GFX10.
4032 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
4033  MachineInstr &MI) const {
4034  unsigned Opc = MI.getOpcode();
4035 
4036  int VOP3Idx[3] = {
4037  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
4038  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
4039  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
4040  };
4041 
4042  if (Opc == AMDGPU::V_PERMLANE16_B32 ||
4043  Opc == AMDGPU::V_PERMLANEX16_B32) {
4044  // src1 and src2 must be scalar
4045  MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
4046  MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
4047  const DebugLoc &DL = MI.getDebugLoc();
4048  if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
4049  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4050  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4051  .add(Src1);
4052  Src1.ChangeToRegister(Reg, false);
4053  }
4054  if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
4055  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4056  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4057  .add(Src2);
4058  Src2.ChangeToRegister(Reg, false);
4059  }
4060  }
4061 
4062  // Find the one SGPR operand we are allowed to use.
4063  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
4064  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
4065  SmallDenseSet<unsigned> SGPRsUsed;
4066  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
4067  if (SGPRReg != AMDGPU::NoRegister) {
4068  SGPRsUsed.insert(SGPRReg);
4069  --ConstantBusLimit;
4070  }
4071 
4072  for (unsigned i = 0; i < 3; ++i) {
4073  int Idx = VOP3Idx[i];
4074  if (Idx == -1)
4075  break;
4076  MachineOperand &MO = MI.getOperand(Idx);
4077 
4078  if (!MO.isReg()) {
4079  if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
4080  continue;
4081 
4082  if (LiteralLimit > 0 && ConstantBusLimit > 0) {
4083  --LiteralLimit;
4084  --ConstantBusLimit;
4085  continue;
4086  }
4087 
4088  --LiteralLimit;
4089  --ConstantBusLimit;
4090  legalizeOpWithMove(MI, Idx);
4091  continue;
4092  }
4093 
4094  if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
4095  !isOperandLegal(MI, Idx, &MO)) {
4096  legalizeOpWithMove(MI, Idx);
4097  continue;
4098  }
4099 
4100  if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
4101  continue; // VGPRs are legal
4102 
4103  // We can use one SGPR in each VOP3 instruction prior to GFX10
4104  // and two starting from GFX10.
4105  if (SGPRsUsed.count(MO.getReg()))
4106  continue;
4107  if (ConstantBusLimit > 0) {
4108  SGPRsUsed.insert(MO.getReg());
4109  --ConstantBusLimit;
4110  continue;
4111  }
4112 
4113  // If we make it this far, then the operand is not legal and we must
4114  // legalize it.
4115  legalizeOpWithMove(MI, Idx);
4116  }
4117 }
4118 
4119 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
4120  MachineRegisterInfo &MRI) const {
4121  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
4122  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
4123  Register DstReg = MRI.createVirtualRegister(SRC);
4124  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
4125 
4126  if (RI.hasAGPRs(VRC)) {
4127  VRC = RI.getEquivalentVGPRClass(VRC);
4128  Register NewSrcReg = MRI.createVirtualRegister(VRC);
4129  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4130  get(TargetOpcode::COPY), NewSrcReg)
4131  .addReg(SrcReg);
4132  SrcReg = NewSrcReg;
4133  }
4134 
4135  if (SubRegs == 1) {
4136  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4137  get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
4138  .addReg(SrcReg);
4139  return DstReg;
4140  }
4141 
4142  SmallVector<unsigned, 8> SRegs;
4143  for (unsigned i = 0; i < SubRegs; ++i) {
4144  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4145  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4146  get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
4147  .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
4148  SRegs.push_back(SGPR);
4149  }
4150 
4151  MachineInstrBuilder MIB =
4152  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4153  get(AMDGPU::REG_SEQUENCE), DstReg);
4154  for (unsigned i = 0; i < SubRegs; ++i) {
4155  MIB.addReg(SRegs[i]);
4156  MIB.addImm(RI.getSubRegFromChannel(i));
4157  }
4158  return DstReg;
4159 }
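// For a multi-word source (e.g. a 64-bit pointer held in VGPRs), the loop
// above emits one V_READFIRSTLANE_B32 per 32-bit channel and reassembles the
// result, roughly (made-up registers):
//   s0 = V_READFIRSTLANE_B32 v[0:1].sub0
//   s1 = V_READFIRSTLANE_B32 v[0:1].sub1
//   s[0:1] = REG_SEQUENCE s0, sub0, s1, sub1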
4160 
4161 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
4162  MachineInstr &MI) const {
4163 
4164  // If the pointer is stored in VGPRs, then we need to move it to
4165  // SGPRs using v_readfirstlane. This is safe because we only select
4166  // loads with uniform pointers to SMRD instructions, so we know the
4167  // pointer value is uniform.
4168  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
4169  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
4170  unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
4171  SBase->setReg(SGPR);
4172  }
4173  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
4174  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
4175  unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
4176  SOff->setReg(SGPR);
4177  }
4178 }
4179 
4180 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
4181  MachineBasicBlock::iterator I,
4182  const TargetRegisterClass *DstRC,
4183  MachineOperand &Op,
4184  MachineRegisterInfo &MRI,
4185  const DebugLoc &DL) const {
4186  Register OpReg = Op.getReg();
4187  unsigned OpSubReg = Op.getSubReg();
4188 
4189  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
4190  RI.getRegClassForReg(MRI, OpReg), OpSubReg);
4191 
4192  // Check if operand is already the correct register class.
4193  if (DstRC == OpRC)
4194  return;
4195 
4196  Register DstReg = MRI.createVirtualRegister(DstRC);
4197  MachineInstr *Copy =
4198  BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
4199 
4200  Op.setReg(DstReg);
4201  Op.setSubReg(0);
4202 
4203  MachineInstr *Def = MRI.getVRegDef(OpReg);
4204  if (!Def)
4205  return;
4206 
4207  // Try to eliminate the copy if it is copying an immediate value.
4208  if (Def->isMoveImmediate())
4209  FoldImmediate(*Copy, *Def, OpReg, &MRI);
4210 }
4211 
4212 // Emit the actual waterfall loop, executing the wrapped instruction for each
4213 // unique value of \p Rsrc across all lanes. In the best case we execute 1
4214 // iteration, in the worst case we execute 64 (once per lane).
4215 static void
4216 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
4217  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4218  const DebugLoc &DL, MachineOperand &Rsrc) {
4219  MachineFunction &MF = *OrigBB.getParent();
4220  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
4221  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4222  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4223  unsigned SaveExecOpc =
4224  ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
4225  unsigned XorTermOpc =
4226  ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
4227  unsigned AndOpc =
4228  ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
4229  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4230 
4231  MachineBasicBlock::iterator I = LoopBB.begin();
4232 
4233  Register VRsrc = Rsrc.getReg();
4234  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
4235 
4236  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4237  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
4238  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
4239  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
4240  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4241  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4242  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4243  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4244  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
4245 
4246  // Beginning of the loop, read the next Rsrc variant.
4247  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
4248  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
4249  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
4250  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
4251  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
4252  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
4253  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
4254  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
4255 
4256  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
4257  .addReg(SRsrcSub0)
4258  .addImm(AMDGPU::sub0)
4259  .addReg(SRsrcSub1)
4260  .addImm(AMDGPU::sub1)
4261  .addReg(SRsrcSub2)
4262  .addImm(AMDGPU::sub2)
4263  .addReg(SRsrcSub3)
4264  .addImm(AMDGPU::sub3);
4265 
4266  // Update Rsrc operand to use the SGPR Rsrc.
4267  Rsrc.setReg(SRsrc);
4268  Rsrc.setIsKill(true);
4269 
4270  // Identify all lanes with identical Rsrc operands in their VGPRs.
4271  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
4272  .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
4273  .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
4274  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
4275  .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
4276  .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
4277  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
4278  .addReg(CondReg0)
4279  .addReg(CondReg1);
4280 
4281  MRI.setSimpleHint(SaveExec, AndCond);
4282 
4283  // Update EXEC to matching lanes, saving original to SaveExec.
4284  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
4285  .addReg(AndCond, RegState::Kill);
4286 
4287  // The original instruction is here; we insert the terminators after it.
4288  I = LoopBB.end();
4289 
4290  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4291  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
4292  .addReg(Exec)
4293  .addReg(SaveExec);
4294  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
4295 }
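// Rough shape of the loop emitted above (illustrative; wave64 shown, wave32
// uses the _B32 forms):
//   loop:
//     sRsrc    = 4 x V_READFIRSTLANE_B32 of vRsrc
//     cond     = (sRsrc[0:1] == vRsrc[0:1]) & (sRsrc[2:3] == vRsrc[2:3])
//     saveexec = S_AND_SAVEEXEC_B64 cond
//     <original instruction, now reading sRsrc>
//     exec     = S_XOR_B64_term exec, saveexec
//     S_CBRANCH_EXECNZ loop
// Each pass handles every lane whose descriptor matches the one read from
// the first active lane, so the loop runs once per unique Rsrc value.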
4296 
4297 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
4298 // with SGPRs by iterating over all unique values across all lanes.
4299 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
4300  MachineOperand &Rsrc, MachineDominatorTree *MDT) {
4301  MachineBasicBlock &MBB = *MI.getParent();
4302  MachineFunction &MF = *MBB.getParent();
4303  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
4304  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4305  MachineRegisterInfo &MRI = MF.getRegInfo();
4306  MachineBasicBlock::iterator I(&MI);
4307  const DebugLoc &DL = MI.getDebugLoc();
4308  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4309  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4310  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4311 
4312  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4313 
4314  // Save the EXEC mask
4315  BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
4316 
4317  // Killed uses in the instruction we are waterfalling around will be
4318  // incorrect due to the added control-flow.
4319  for (auto &MO : MI.uses()) {
4320  if (MO.isReg() && MO.isUse()) {
4321  MRI.clearKillFlags(MO.getReg());
4322  }
4323  }
4324 
4325  // To insert the loop we need to split the block. Move everything after this
4326  // point to a new block, and insert a new empty block between the two.
4327  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
4328  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
4329  MachineFunction::iterator MBBI(MBB);
4330  ++MBBI;
4331 
4332  MF.insert(MBBI, LoopBB);
4333  MF.insert(MBBI, RemainderBB);
4334 
4335  LoopBB->addSuccessor(LoopBB);
4336  LoopBB->addSuccessor(RemainderBB);
4337 
4338  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
4339  MachineBasicBlock::iterator J = I++;
4340  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4341  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4342  LoopBB->splice(LoopBB->begin(), &MBB, J);
4343 
4344  MBB.addSuccessor(LoopBB);
4345 
4346  // Update dominators. We know that MBB immediately dominates LoopBB, that
4347  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
4348  // dominates all of the successors transferred to it from MBB that MBB used
4349  // to dominate.
4350  if (MDT) {
4351  MDT->addNewBlock(LoopBB, &MBB);
4352  MDT->addNewBlock(RemainderBB, LoopBB);
4353  for (auto &Succ : RemainderBB->successors()) {
4354  if (MDT->dominates(&MBB, Succ)) {
4355  MDT->changeImmediateDominator(Succ, RemainderBB);
4356  }
4357  }
4358  }
4359 
4360  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
4361 
4362  // Restore the EXEC mask
4363  MachineBasicBlock::iterator First = RemainderBB->begin();
4364  BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
4365 }
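// Resulting control flow (illustrative): the original block is split into
//   MBB -> LoopBB -> RemainderBB
// with LoopBB also branching back to itself, the waterfalled instruction
// living in LoopBB, and EXEC saved before the loop and restored at the top
// of RemainderBB so the code after it runs under the original mask.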
4366 
4367 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
4368 static std::tuple<unsigned, unsigned>
4369 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
4370  MachineBasicBlock &MBB = *MI.getParent();
4371  MachineFunction &MF = *MBB.getParent();
4372  MachineRegisterInfo &MRI = MF.getRegInfo();
4373 
4374  // Extract the ptr from the resource descriptor.
4375  unsigned RsrcPtr =
4376  TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
4377  AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
4378 
4379  // Create an empty resource descriptor
4380  Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4381  Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4382  Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4383  Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
4384  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
4385 
4386  // Zero64 = 0
4387  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
4388  .addImm(0);
4389 
4390  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
4391  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
4392  .addImm(RsrcDataFormat & 0xFFFFFFFF);
4393 
4394  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
4395  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
4396  .addImm(RsrcDataFormat >> 32);
4397 
4398  // NewSRsrc = {Zero64, SRsrcFormat}
4399  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
4400  .addReg(Zero64)
4401  .addImm(AMDGPU::sub0_sub1)
4402  .addReg(SRsrcFormatLo)
4403  .addImm(AMDGPU::sub2)
4404  .addReg(SRsrcFormatHi)
4405  .addImm(AMDGPU::sub3);
4406 
4407  return std::make_tuple(RsrcPtr, NewSRsrc);
4408 }
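// The replacement descriptor built above is effectively
//   { base = 0 (64 bits), RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
// i.e. a null base address with the default data format; the real 64-bit
// pointer extracted from the original Rsrc is instead folded into VAddr by
// the ADDR64 paths in legalizeOperands below.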
4409 
4410 void SIInstrInfo::legalizeOperands(MachineInstr &MI,
4411  MachineDominatorTree *MDT) const {
4412  MachineFunction &MF = *MI.getParent()->getParent();
4413  MachineRegisterInfo &MRI = MF.getRegInfo();
4414 
4415  // Legalize VOP2
4416  if (isVOP2(MI) || isVOPC(MI)) {
4417  legalizeOperandsVOP2(MRI, MI);
4418  return;
4419  }
4420 
4421  // Legalize VOP3
4422  if (isVOP3(MI)) {
4423  legalizeOperandsVOP3(MRI, MI);
4424  return;
4425  }
4426 
4427  // Legalize SMRD
4428  if (isSMRD(MI)) {
4429  legalizeOperandsSMRD(MRI, MI);
4430  return;
4431  }
4432 
4433  // Legalize REG_SEQUENCE and PHI
4434  // The register class of the operands must be the same type as the register
4435  // class of the output.
4436  if (MI.getOpcode() == AMDGPU::PHI) {
4437  const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
4438  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
4439  if (!MI.getOperand(i).isReg() ||
4440  !Register::isVirtualRegister(MI.getOperand(i).getReg()))
4441  continue;
4442  const TargetRegisterClass *OpRC =
4443  MRI.getRegClass(MI.getOperand(i).getReg());
4444  if (RI.hasVectorRegisters(OpRC)) {
4445  VRC = OpRC;
4446  } else {
4447  SRC = OpRC;
4448  }
4449  }
4450 
4451  // If any of the operands are VGPR registers, then they all must be
4452  // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
4453  // legalizing them.
4454  if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
4455  if (!VRC) {
4456  assert(SRC);
4457  VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC)
4458  : RI.getEquivalentVGPRClass(SRC);
4459  }
4460  RC = VRC;
4461  } else {
4462  RC = SRC;
4463  }
4464 
4465  // Update all the operands so they have the same type.
4466  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4467  MachineOperand &Op = MI.getOperand(I);
4468  if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
4469  continue;
4470 
4471  // MI is a PHI instruction.
4472  MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
4473  MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
4474 
4475  // Avoid creating no-op copies with the same src and dst reg class. These
4476  // confuse some of the machine passes.
4477  legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
4478  }
4479  }
4480 
4481  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
4482  // VGPR dest type and SGPR sources, insert copies so all operands are
4483  // VGPRs. This seems to help operand folding / the register coalescer.
4484  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
4485  MachineBasicBlock *MBB = MI.getParent();
4486  const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
4487  if (RI.hasVGPRs(DstRC)) {
4488  // Update all the operands so they are VGPR register classes. These may
4489  // not be the same register class because REG_SEQUENCE supports mixing
4490  // subregister index types e.g. sub0_sub1 + sub2 + sub3
4491  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4492  MachineOperand &Op = MI.getOperand(I);
4493  if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
4494  continue;
4495 
4496  const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
4497  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
4498  if (VRC == OpRC)
4499  continue;
4500 
4501  legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
4502  Op.setIsKill();
4503  }
4504  }
4505 
4506  return;
4507  }
4508 
4509  // Legalize INSERT_SUBREG
4510  // src0 must have the same register class as dst
4511  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
4512  Register Dst = MI.getOperand(0).getReg();
4513  Register Src0 = MI.getOperand(1).getReg();
4514  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
4515  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
4516  if (DstRC != Src0RC) {
4517  MachineBasicBlock *MBB = MI.getParent();
4518  MachineOperand &Op = MI.getOperand(1);
4519  legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
4520  }
4521  return;
4522  }
4523 
4524  // Legalize SI_INIT_M0
4525  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
4526  MachineOperand &Src = MI.getOperand(0);
4527  if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
4528  Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
4529  return;
4530  }
4531 
4532  // Legalize MIMG and MUBUF/MTBUF for shaders.
4533  //
4534  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
4535  // scratch memory access. In both cases, the legalization never involves
4536  // conversion to the addr64 form.
4537  if (isMIMG(MI) ||
4538  (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
4539  (isMUBUF(MI) || isMTBUF(MI)))) {
4540  MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
4541  if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
4542  unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
4543  SRsrc->setReg(SGPR);
4544  }
4545 
4546  MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
4547  if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
4548  unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
4549  SSamp->setReg(SGPR);
4550  }
4551  return;
4552  }
4553 
4554  // Legalize MUBUF* instructions.
4555  int RsrcIdx =
4556  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
4557  if (RsrcIdx != -1) {
4558  // We have an MUBUF instruction
4559  MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
4560  unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
4561  if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
4562  RI.getRegClass(RsrcRC))) {
4563  // The operands are legal.
4564  // FIXME: We may need to legalize operands besides srsrc.
4565  return;
4566  }
4567 
4568  // Legalize a VGPR Rsrc.
4569  //
4570  // If the instruction is _ADDR64, we can avoid a waterfall by extracting
4571  // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
4572  // a zero-value SRsrc.
4573  //
4574  // If the instruction is _OFFSET (both idxen and offen disabled), and we
4575  // support ADDR64 instructions, we can convert to ADDR64 and do the same as
4576  // above.
4577  //
4578  // Otherwise we are on non-ADDR64 hardware, and/or we have
4579  // idxen/offen/bothen and we fall back to a waterfall loop.
4580 
4581  MachineBasicBlock &MBB = *MI.getParent();
4582 
4583  MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4584  if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
4585  // This is already an ADDR64 instruction so we need to add the pointer
4586  // extracted from the resource descriptor to the current value of VAddr.
4587  Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4588  Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4589  Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4590 
4591  const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4592  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
4593  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
4594 
4595  unsigned RsrcPtr, NewSRsrc;
4596  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4597 
4598  // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
4599  const DebugLoc &DL = MI.getDebugLoc();
4600  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo)
4601  .addDef(CondReg0)
4602  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4603  .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
4604  .addImm(0);
4605 
4606  // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
4607  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
4608  .addDef(CondReg1, RegState::Dead)
4609  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4610  .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
4611  .addReg(CondReg0, RegState::Kill)
4612  .addImm(0);
4613 
4614  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4615  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4616  .addReg(NewVAddrLo)
4617  .addImm(AMDGPU::sub0)
4618  .addReg(NewVAddrHi)
4619  .addImm(AMDGPU::sub1);
4620 
4621  VAddr->setReg(NewVAddr);
4622  Rsrc->setReg(NewSRsrc);
4623  } else if (!VAddr && ST.hasAddr64()) {
4624  // This instructions is the _OFFSET variant, so we need to convert it to
4625  // ADDR64.
4626  assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4628  "FIXME: Need to emit flat atomics here");
4629 
4630  unsigned RsrcPtr, NewSRsrc;
4631  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4632 
4633  Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4634  MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4635  MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4636  MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4637  unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4638 
4639  // Atomics with return have an additional tied operand and are
4640  // missing some of the special bits.
4641  MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4642  MachineInstr *Addr64;
4643 
4644  if (!VDataIn) {
4645  // Regular buffer load / store.
4646  MachineInstrBuilder MIB =
4647  BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4648  .add(*VData)
4649  .addReg(NewVAddr)
4650  .addReg(NewSRsrc)
4651  .add(*SOffset)
4652  .add(*Offset);
4653 
4654  // Atomics do not have this operand.
4655  if (const MachineOperand *GLC =
4656  getNamedOperand(MI, AMDGPU::OpName::glc)) {
4657  MIB.addImm(GLC->getImm());
4658  }
4659  if (const MachineOperand *DLC =
4660  getNamedOperand(MI, AMDGPU::OpName::dlc)) {
4661  MIB.addImm(DLC->getImm());
4662  }
4663 
4664  MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
4665 
4666  if (const MachineOperand *TFE =
4667  getNamedOperand(MI, AMDGPU::OpName::tfe)) {
4668  MIB.addImm(TFE->getImm());
4669  }
4670 
4671  MIB.cloneMemRefs(MI);
4672  Addr64 = MIB;
4673  } else {
4674  // Atomics with return.
4675  Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4676  .add(*VData)
4677  .add(*VDataIn)
4678  .addReg(NewVAddr)
4679  .addReg(NewSRsrc)
4680  .add(*SOffset)
4681  .add(*Offset)
4682  .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
4683  .cloneMemRefs(MI);
4684  }
4685 
4686  MI.removeFromParent();
4687 
4688  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4689  BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
4690  NewVAddr)
4691  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4692  .addImm(AMDGPU::sub0)
4693  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4694  .addImm(AMDGPU::sub1);
4695  } else {
4696  // This is another variant; legalize Rsrc with waterfall loop from VGPRs
4697  // to SGPRs.
4698  loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
4699  }
4700  }
4701 }
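// Illustrative example of the _OFFSET -> _ADDR64 conversion above (register
// names are made up): a BUFFER_LOAD_DWORD_OFFSET whose rsrc ended up in
// VGPRs is rewritten as
//   (ptr, zeroRsrc) = extractRsrcPtr(...)
//   newVAddr = REG_SEQUENCE ptr.sub0, sub0, ptr.sub1, sub1
//   BUFFER_LOAD_DWORD_ADDR64 vdata, newVAddr, zeroRsrc, soffset, offset, ...
// so the pointer is carried in VADDR while the new SRsrc only supplies the
// default data format. Hardware without ADDR64 falls back to the waterfall
// loop instead.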
4702 
4703 void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
4704  MachineDominatorTree *MDT) const {
4705  SetVectorType Worklist;
4706  Worklist.insert(&TopInst);
4707 
4708  while (!Worklist.empty()) {
4709  MachineInstr &Inst = *Worklist.pop_back_val();
4710  MachineBasicBlock *MBB = Inst.getParent();
4711  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4712 
4713  unsigned Opcode = Inst.getOpcode();
4714  unsigned NewOpcode = getVALUOp(Inst);
4715 
4716  // Handle some special cases
4717  switch (Opcode) {
4718  default:
4719  break;
4720  case AMDGPU::S_ADD_U64_PSEUDO:
4721  case AMDGPU::S_SUB_U64_PSEUDO:
4722  splitScalar64BitAddSub(Worklist, Inst, MDT);
4723  Inst.eraseFromParent();
4724  continue;
4725  case AMDGPU::S_ADD_I32:
4726  case AMDGPU::S_SUB_I32:
4727  // FIXME: The u32 versions currently selected use the carry.
4728  if (moveScalarAddSub(Worklist, Inst, MDT))
4729  continue;
4730 
4731  // Default handling
4732  break;
4733  case AMDGPU::S_AND_B64:
4734  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
4735  Inst.eraseFromParent();
4736  continue;
4737 
4738  case AMDGPU::S_OR_B64:
4739  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
4740  Inst.eraseFromParent();
4741  continue;
4742 
4743  case AMDGPU::S_XOR_B64:
4744  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
4745  Inst.eraseFromParent();
4746  continue;
4747 
4748  case AMDGPU::S_NAND_B64:
4749  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
4750  Inst.eraseFromParent();
4751  continue;
4752 
4753  case AMDGPU::S_NOR_B64:
4754  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
4755  Inst.eraseFromParent();
4756  continue;
4757 
4758  case AMDGPU::S_XNOR_B64:
4759  if (ST.hasDLInsts())
4760  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
4761  else
4762  splitScalar64BitXnor(Worklist, Inst, MDT);
4763  Inst.eraseFromParent();
4764  continue;
4765 
4766  case AMDGPU::S_ANDN2_B64:
4767  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
4768  Inst.eraseFromParent();
4769  continue;
4770 
4771  case AMDGPU::S_ORN2_B64:
4772  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
4773  Inst.eraseFromParent();
4774  continue;
4775 
4776  case AMDGPU::S_NOT_B64:
4777  splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
4778  Inst.eraseFromParent();
4779  continue;
4780 
4781  case AMDGPU::S_BCNT1_I32_B64:
4782  splitScalar64BitBCNT(Worklist, Inst);
4783  Inst.eraseFromParent();
4784  continue;
4785 
4786  case AMDGPU::S_BFE_I64:
4787  splitScalar64BitBFE(Worklist, Inst);
4788  Inst.eraseFromParent();
4789  continue;
4790 
4791  case AMDGPU::S_LSHL_B32:
4792  if (ST.hasOnlyRevVALUShifts()) {
4793  NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
4794  swapOperands(Inst);
4795  }
4796  break;
4797  case AMDGPU::S_ASHR_I32:
4798  if (ST.hasOnlyRevVALUShifts()) {
4799  NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
4800  swapOperands(Inst);
4801  }
4802  break;
4803  case AMDGPU::S_LSHR_B32:
4804  if (ST.hasOnlyRevVALUShifts()) {
4805  NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
4806  swapOperands(Inst);
4807  }
4808  break;
4809  case AMDGPU::S_LSHL_B64:
4810  if (ST.hasOnlyRevVALUShifts()) {
4811  NewOpcode = AMDGPU::V_LSHLREV_B64;
4812  swapOperands(Inst);
4813  }
4814  break;
4815  case AMDGPU::S_ASHR_I64:
4816  if (ST.hasOnlyRevVALUShifts()) {
4817  NewOpcode = AMDGPU::V_ASHRREV_I64;
4818  swapOperands(Inst);
4819  }
4820  break;
4821  case AMDGPU::S_LSHR_B64:
4822  if (ST.hasOnlyRevVALUShifts()) {
4823  NewOpcode = AMDGPU::V_LSHRREV_B64;
4824  swapOperands(Inst);
4825  }
4826  break;
4827 
4828  case AMDGPU::S_ABS_I32:
4829  lowerScalarAbs(Worklist, Inst);
4830  Inst.eraseFromParent();
4831  continue;
4832 
4833  case AMDGPU::S_CBRANCH_SCC0:
4834  case AMDGPU::S_CBRANCH_SCC1:
4835  // Clear unused bits of vcc
4836  if (ST.isWave32())
4837  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
4838  AMDGPU::VCC_LO)
4839  .addReg(AMDGPU::EXEC_LO)
4840  .addReg(AMDGPU::VCC_LO);
4841  else
4842  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
4843  AMDGPU::VCC)
4844  .addReg(AMDGPU::EXEC)
4845  .addReg(AMDGPU::VCC);
4846  break;
4847 
4848  case AMDGPU::S_BFE_U64:
4849  case AMDGPU::S_BFM_B64:
4850  llvm_unreachable("Moving this op to VALU not implemented");
4851 
4852  case AMDGPU::S_PACK_LL_B32_B16:
4853  case AMDGPU::S_PACK_LH_B32_B16:
4854  case AMDGPU::S_PACK_HH_B32_B16:
4855  movePackToVALU(Worklist, MRI, Inst);
4856  Inst.eraseFromParent();
4857  continue;
4858 
4859  case AMDGPU::S_XNOR_B32:
4860  lowerScalarXnor(Worklist, Inst);
4861  Inst.eraseFromParent();
4862  continue;
4863 
4864  case AMDGPU::S_NAND_B32:
4865  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
4866  Inst.eraseFromParent();
4867  continue;
4868 
4869  case AMDGPU::S_NOR_B32:
4870  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
4871  Inst.eraseFromParent();
4872  continue;
4873 
4874  case AMDGPU::S_ANDN2_B32:
4875  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
4876  Inst.eraseFromParent();
4877  continue;
4878 
4879  case AMDGPU::S_ORN2_B32:
4880  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
4881  Inst.eraseFromParent();
4882  continue;
4883  }
4884 
4885  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
4886  // We cannot move this instruction to the VALU, so we should try to
4887  // legalize its operands instead.
4888  legalizeOperands(Inst, MDT);
4889  continue;
4890  }
4891 
4892  // Use the new VALU Opcode.
4893  const MCInstrDesc &NewDesc = get(NewOpcode);
4894  Inst.setDesc(NewDesc);
4895 
4896  // Remove any references to SCC. Vector instructions can't read from it, and
4897  // we're just about to add the implicit use / defs of VCC, and we don't want
4898  // both.
4899  for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
4900  MachineOperand &Op = Inst.getOperand(i);
4901  if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
4902  // Only propagate through live-def of SCC.
4903  if (Op.isDef() && !Op.isDead())
4904  addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
4905  Inst.RemoveOperand(i);
4906  }
4907  }
4908 
4909  if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4910  // We are converting these to a BFE, so we need to add the missing
4911  // operands for the size and offset.
4912  unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4913  Inst.addOperand(MachineOperand::CreateImm(0));
4914  Inst.addOperand(MachineOperand::CreateImm(Size));
4915 
4916  } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4917  // The VALU version adds the second operand to the result, so insert an
4918  // extra 0 operand.
4919  Inst.addOperand(MachineOperand::CreateImm(0));
4920  }
4921 
4922  Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
4923  fixImplicitOperands(Inst);
4924 
4925  if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4926  const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4927  // If we need to move this to VGPRs, we need to unpack the second operand
4928  // back into the 2 separate ones for bit offset and width.
4929  assert(OffsetWidthOp.isImm() &&
4930  "Scalar BFE is only implemented for constant width and offset");
4931  uint32_t Imm = OffsetWidthOp.getImm();
4932 
4933  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4934  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4935  Inst.RemoveOperand(2); // Remove old immediate.
4936  Inst.addOperand(MachineOperand::CreateImm(Offset));
4937  Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4938  }
4939 
4940  bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4941  unsigned NewDstReg = AMDGPU::NoRegister;
4942  if (HasDst) {
4943  Register DstReg = Inst.getOperand(0).getReg();
4944  if (Register::isPhysicalRegister(DstReg))
4945  continue;
4946 
4947  // Update the destination register class.
4948  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4949  if (!NewDstRC)
4950  continue;
4951 
4952  if (Inst.isCopy() &&
4953  Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
4954  NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4955  // Instead of creating a copy where src and dst are the same register
4956  // class, we just replace all uses of dst with src. These kinds of
4957  // copies interfere with the heuristics MachineSink uses to decide
4958  // whether or not to split a critical edge, since the pass assumes
4959  // that copies will end up as machine instructions and not be
4960  // eliminated.
4961  addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4962  MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4963  MRI.clearKillFlags(Inst.getOperand(1).getReg());
4964  Inst.getOperand(0).setReg(DstReg);
4965 
4966  // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4967  // these are deleted later, but at -O0 it would leave a suspicious
4968  // looking illegal copy of an undef register.
4969  for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4970  Inst.RemoveOperand(I);
4971  Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4972  continue;
4973  }
4974 
4975  NewDstReg = MRI.createVirtualRegister(NewDstRC);
4976  MRI.replaceRegWith(DstReg, NewDstReg);
4977  }
4978 
4979  // Legalize the operands
4980  legalizeOperands(Inst, MDT);
4981 
4982  if (HasDst)
4983  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4984  }
4985 }
4986 
4987 // Add/sub require special handling to deal with carry outs.
4988 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4989  MachineDominatorTree *MDT) const {
4990  if (ST.hasAddNoCarry()) {
4991  // Assume there is no user of scc since we don't select this in that case.
4992  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4993  // is used.
4994 
4995  MachineBasicBlock &MBB = *Inst.getParent();
4996  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4997 
4998  Register OldDstReg = Inst.getOperand(0).getReg();
4999  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5000 
5001  unsigned Opc = Inst.getOpcode();
5002  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
5003 
5004  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
5005  AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
5006 
5007  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
5008  Inst.RemoveOperand(3);
5009 
5010  Inst.setDesc(get(NewOpc));
5011  Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
5012  Inst.addImplicitDefUseOperands(*MBB.getParent());
5013  MRI.replaceRegWith(OldDstReg, ResultReg);
5014  legalizeOperands(Inst, MDT);
5015 
5016  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5017  return true;
5018  }
5019 
5020  return false;
5021 }
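// Illustrative rewrite performed above on subtargets with add-no-carry
// (made-up virtual registers):
//   %d:sgpr_32 = S_ADD_I32 %a, %b, implicit-def dead $scc
// becomes
//   %d:vgpr_32 = V_ADD_U32_e64 %a, %b, 0   ; trailing 0 is the clamp bit
// The SCC def is simply dropped, since this path is only taken when SCC has
// no users.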
5022 
5023 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
5024  MachineInstr &Inst) const {
5025  MachineBasicBlock &MBB = *Inst.getParent();
5026  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5027  MachineBasicBlock::iterator MII = Inst;
5028  DebugLoc DL = Inst.getDebugLoc();
5029 
5030  MachineOperand &Dest = Inst.getOperand(0);
5031  MachineOperand &Src = Inst.getOperand(1);
5032  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5033  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5034 
5035  unsigned SubOp = ST.hasAddNoCarry() ?
5036  AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
5037 
5038  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
5039  .addImm(0)
5040  .addReg(Src.getReg());
5041 
5042  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
5043  .addReg(Src.getReg())
5044  .addReg(TmpReg);
5045 
5046  MRI.replaceRegWith(Dest.getReg(), ResultReg);
5047  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5048 }
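// lowerScalarAbs implements |x| as max(x, 0 - x) on the VALU, roughly
// (made-up registers):
//   %t = V_SUB_{I,U}32_e32 0, %x
//   %r = V_MAX_I32_e64 %x, %t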
5049 
5050 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
5051  MachineInstr &Inst) const {
5052  MachineBasicBlock &MBB = *Inst.getParent();
5053  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5054  MachineBasicBlock::iterator MII = Inst;
5055  const DebugLoc &DL = Inst.getDebugLoc();
5056 
5057  MachineOperand &Dest = Inst.getOperand(0);
5058  MachineOperand &Src0 = Inst.getOperand(1);
5059  MachineOperand &Src1 = Inst.getOperand(2);
5060 
5061  if (ST.hasDLInsts()) {
5062  Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5063  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
5064  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
5065 
5066  BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
5067  .add(Src0)
5068  .add(Src1);
5069 
5070  MRI.replaceRegWith(Dest.getReg(), NewDest);
5071  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
5072  } else {
5073  // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
5074  // invert either source and then perform the XOR. If either source is a
5075  // scalar register, then we can leave the inversion on the scalar unit to
5076  // achieve a better distribution of scalar and vector instructions.
5077  bool Src0IsSGPR = Src0.isReg() &&
5078  RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
5079  bool Src1IsSGPR = Src1.isReg() &&
5080  RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
5081  MachineInstr *Xor;
5082  Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5083  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5084 
5085  // Build a pair of scalar instructions and add them to the work list.
5086  // The next iteration over the work list will lower these to the vector
5087  // unit as necessary.
5088  if (Src0IsSGPR) {
5089  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
5090  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
5091  .addReg(Temp)
5092  .add(Src1);
5093  } else if (Src1IsSGPR) {
5094  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
5095  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
5096  .add(Src0)
5097  .addReg(Temp);
5098  } else {
5099  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
5100  .add(Src0)
5101  .add(Src1);
5102  MachineInstr *Not =
5103  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
5104  Worklist.insert(Not);
5105  }
5106 
5107  MRI.replaceRegWith(Dest.getReg(), NewDest);
5108 
5109  Worklist.insert(Xor);
5110 
5111  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
5112  }
5113 }
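// Example of the pre-DL path above (made-up registers), using the inversion
// identity from the comment: with %a known to be in an SGPR,
//   %d = S_XNOR_B32 %a, %b
// becomes
//   %t = S_NOT_B32 %a
//   %d = S_XOR_B32 %t, %b
// and the new instructions are queued on the worklist so a later iteration
// can move them to the VALU if their uses require it.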
5114 
5115 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
5116  MachineInstr &Inst,
5117  unsigned Opcode) const {
5118  MachineBasicBlock &MBB = *Inst.getParent();
5119  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5120  MachineBasicBlock::iterator MII = Inst;
5121  const DebugLoc &DL = Inst.getDebugLoc();
5122 
5123  MachineOperand &Dest = Inst.getOperand(0);
5124  MachineOperand &Src0 = Inst.getOperand(1);
5125  MachineOperand &Src1 = Inst.getOperand(2);
5126 
5127  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5128  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5129 
5130  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
5131  .add(Src0)
5132  .add(Src1);
5133 
5134  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
5135  .addReg(Interm);
5136 
5137  Worklist.insert(&Op);
5138  Worklist.insert(&Not);
5139 
5140  MRI.replaceRegWith(Dest.getReg(), NewDest);
5141  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
5142 }
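// --- Illustrative sketch, not part of the original SIInstrInfo.cpp ---
// Plain-integer model of the rewrite splitScalarNotBinop performs: the
// negated binop is recomputed as the base binop into Interm, followed by a
// bitwise NOT into the new destination. The helper name 'nandModel' and the
// choice of AND as the base opcode are hypothetical, for illustration only.
static inline uint32_t nandModel(uint32_t Src0, uint32_t Src1) {
  uint32_t Interm = Src0 & Src1; // BuildMI(..., get(Opcode), Interm)
  return ~Interm;                // S_NOT_B32 into NewDest
}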
5143 
5144 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
5145  MachineInstr &Inst,
5146  unsigned Opcode) const {
5147  MachineBasicBlock &MBB = *Inst.getParent();
 5148  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 5149  MachineBasicBlock::iterator MII = Inst;
5150  const DebugLoc &DL = Inst.getDebugLoc();
5151 
5152  MachineOperand &Dest = Inst.getOperand(0);
5153  MachineOperand &Src0 = Inst.getOperand(1);
5154  MachineOperand &Src1 = Inst.getOperand(2);
5155 
5156  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5157  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5158 
5159  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
5160  .add(Src1);
5161 
5162  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
5163  .add(Src0)
5164  .addReg(Interm);
5165 
5166  Worklist.insert(&Not);
5167  Worklist.insert(&Op);
5168 
5169  MRI.replaceRegWith(Dest.getReg(), NewDest);
5170  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
5171 }
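// --- Illustrative sketch, not part of the original SIInstrInfo.cpp ---
// Plain-integer model of splitScalarBinOpN2: the "N2" forms consume the
// complement of the second source, so Src1 is inverted into Interm first and
// the base binop is then applied. The helper name 'andn2Model' and the use
// of AND as the base opcode are hypothetical, for illustration only.
static inline uint32_t andn2Model(uint32_t Src0, uint32_t Src1) {
  uint32_t Interm = ~Src1;  // S_NOT_B32 into Interm
  return Src0 & Interm;     // base opcode into NewDest
}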
5172 
5173 void SIInstrInfo::splitScalar64BitUnaryOp(
5174  SetVectorType &Worklist, MachineInstr &Inst,
5175  unsigned Opcode) const {
5176  MachineBasicBlock &MBB = *Inst.getParent();
 5177  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 5178 
5179  MachineOperand &Dest = Inst.getOperand(0);
5180  MachineOperand &Src0 = Inst.getOperand(1);
5181  DebugLoc DL = Inst.getDebugLoc();
5182 
5183  MachineBasicBlock::iterator MII = Inst;
5184 
5185  const MCInstrDesc &InstDesc = get(Opcode);
5186  const TargetRegisterClass *Src0RC = Src0.isReg() ?
5187  MRI.getRegClass(Src0.getReg()) :
5188  &AMDGPU::SGPR_32RegClass;
5189 
5190  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
5191 
5192  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
5193  AMDGPU::sub0, Src0SubRC);
5194 
5195  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
5196  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
5197  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
5198 
5199  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
5200  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
5201 
5202  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
5203  AMDGPU::sub1, Src0SubRC);
5204 
5205  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
5206  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
5207 
5208  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
5209  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
5210  .addReg(DestSub0)
5211  .addImm(AMDGPU::sub0)
5212  .addReg(DestSub1)
5213  .addImm(AMDGPU::sub1);
5214 
5215  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
5216 
5217  Worklist.insert(&LoHalf);
5218  Worklist.insert(&HiHalf);
5219 
5220  // We don't need to legalizeOperands here because for a single operand, src0
5221  // will support any kind of input.
5222 
5223  // Move all users of this moved value.
5224  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
5225 }
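// --- Illustrative sketch, not part of the original SIInstrInfo.cpp ---
// splitScalar64BitUnaryOp applies a 32-bit opcode to the sub0 and sub1 halves
// of the 64-bit source independently and recombines the results with
// REG_SEQUENCE. For a bitwise operation such as NOT this is exact, as this
// compile-time check on an arbitrary 64-bit constant illustrates.
static_assert(~0x0123456789ABCDEFull ==
                  ((uint64_t(~uint32_t(0x01234567)) << 32) |
                   uint64_t(~uint32_t(0x89ABCDEF))),
              "64-bit NOT == 32-bit NOT of each half, recombined");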
5226 
5227 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
5228  MachineInstr &Inst,
5229  MachineDominatorTree *MDT) const {
5230  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5231 
5232  MachineBasicBlock &MBB = *Inst.getParent();
 5233  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 5234  const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5235 
5236  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5237  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5238  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5239 
5240  Register CarryReg = MRI.createVirtualRegister(CarryRC);
5241  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5242 
5243  MachineOperand &Dest = Inst.getOperand(0);
5244  MachineOperand &Src0 = Inst.getOperand(1);
5245  MachineOperand &Src1 = Inst.getOperand(2);
5246  const DebugLoc &DL = Inst.getDebugLoc();
5247  MachineBasicBlock::iterator MII = Inst;
5248 
5249  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
5250  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
5251  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
5252  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
5253 
5254  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
5255  AMDGPU::sub0, Src0SubRC);
5256  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
5257  AMDGPU::sub0, Src1SubRC);
5258 
5259 
5260  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
5261  AMDGPU::sub1, Src0SubRC);
5262  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
5263  AMDGPU::sub1, Src1SubRC);
5264 
5265  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
5266  MachineInstr *LoHalf =
5267  BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
5268  .addReg(CarryReg, RegState::Define)
5269  .add(SrcReg0Sub0)
5270  .add(SrcReg1Sub0)
5271  .addImm(0); // clamp bit
5272 
5273  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5274  MachineInstr *HiHalf =
5275  BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
5276  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5277  .add(SrcReg0Sub1)
5278  .add(SrcReg1Sub1)
5279  .addReg(CarryReg, RegState::Kill)
5280  .addImm(0); // clamp bit
5281 
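// --- Illustrative sketch, not part of the original SIInstrInfo.cpp ---
// Compile-time check of the decomposition built above: a 64-bit add is the
// 32-bit add of the low halves plus the carry-out of that add fed into the
// 32-bit add of the high halves. The constants are arbitrary example values
// chosen so that the low-half add actually produces a carry.
static_assert(0x00000001FFFFFFFFull + 0x0000000200000001ull ==
                  (uint64_t(uint32_t(0xFFFFFFFFu + 0x00000001u)) |
                   (uint64_t(0x00000001u + 0x00000002u +
                             uint32_t((0xFFFFFFFFull + 0x00000001ull) >> 32))
                    << 32)),
              "64-bit add == low-half add with carry into the high-half add");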
5282  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
5283  .addReg(DestSub0)