LLVM  9.0.0svn
SIInstrInfo.cpp
1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "GCNHazardRecognizer.h"
18 #include "SIDefines.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringRef.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/InlineAsm.h"
50 #include "llvm/IR/LLVMContext.h"
51 #include "llvm/MC/MCInstrDesc.h"
52 #include "llvm/Support/Casting.h"
54 #include "llvm/Support/Compiler.h"
59 #include <cassert>
60 #include <cstdint>
61 #include <iterator>
62 #include <utility>
63 
64 using namespace llvm;
65 
66 #define GET_INSTRINFO_CTOR_DTOR
67 #include "AMDGPUGenInstrInfo.inc"
68 
69 namespace llvm {
70 namespace AMDGPU {
71 #define GET_D16ImageDimIntrinsics_IMPL
72 #define GET_ImageDimIntrinsicTable_IMPL
73 #define GET_RsrcIntrinsics_IMPL
74 #include "AMDGPUGenSearchableTables.inc"
75 }
76 }
77 
78 
79 // Must be at least 4 to be able to branch over minimum unconditional branch
80 // code. This is only for making it possible to write reasonably small tests for
81 // long branches.
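// (For example, tests can pass a small -amdgpu-s-branch-bits=<N> value to llc
// to force the long-branch expansion without needing huge basic blocks.)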
82 static cl::opt<unsigned>
83 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
84  cl::desc("Restrict range of branch instructions (DEBUG)"));
85 
86 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
87   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
88  RI(ST), ST(ST) {}
89 
90 //===----------------------------------------------------------------------===//
91 // TargetInstrInfo callbacks
92 //===----------------------------------------------------------------------===//
93 
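// Count the operands of a selection DAG node, ignoring any trailing glue
// operands that the scheduler appends for ordering dependencies.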
94 static unsigned getNumOperandsNoGlue(SDNode *Node) {
95  unsigned N = Node->getNumOperands();
96  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
97  --N;
98  return N;
99 }
100 
101 static SDValue findChainOperand(SDNode *Load) {
102   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
103  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
104  return LastOp;
105 }
106 
107 /// Returns true if both nodes have the same value for the given
108 /// operand \p Op, or if both nodes do not have this operand.
109 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
110  unsigned Opc0 = N0->getMachineOpcode();
111  unsigned Opc1 = N1->getMachineOpcode();
112 
113  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
114  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
115 
116  if (Op0Idx == -1 && Op1Idx == -1)
117  return true;
118 
119 
120  if ((Op0Idx == -1 && Op1Idx != -1) ||
121  (Op1Idx == -1 && Op0Idx != -1))
122  return false;
123 
124  // getNamedOperandIdx returns the index for the MachineInstr's operands,
125  // which includes the result as the first operand. We are indexing into the
126  // MachineSDNode's operands, so we need to skip the result operand to get
127  // the real index.
128  --Op0Idx;
129  --Op1Idx;
130 
131  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
132 }
133 
134 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
135                                                     AliasAnalysis *AA) const {
136  // TODO: The generic check fails for VALU instructions that should be
137  // rematerializable due to implicit reads of exec. We really want all of the
138  // generic logic for this except for this.
139  switch (MI.getOpcode()) {
140  case AMDGPU::V_MOV_B32_e32:
141  case AMDGPU::V_MOV_B32_e64:
142  case AMDGPU::V_MOV_B64_PSEUDO:
143  // No implicit operands.
144  return MI.getNumOperands() == MI.getDesc().getNumOperands();
145  default:
146  return false;
147  }
148 }
149 
150 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
151                                           int64_t &Offset0,
152  int64_t &Offset1) const {
153  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
154  return false;
155 
156  unsigned Opc0 = Load0->getMachineOpcode();
157  unsigned Opc1 = Load1->getMachineOpcode();
158 
159  // Make sure both are actually loads.
160  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
161  return false;
162 
163  if (isDS(Opc0) && isDS(Opc1)) {
164 
165  // FIXME: Handle this case:
166  if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
167  return false;
168 
169  // Check base reg.
170  if (Load0->getOperand(1) != Load1->getOperand(1))
171  return false;
172 
173  // Check chain.
174  if (findChainOperand(Load0) != findChainOperand(Load1))
175  return false;
176 
177  // Skip read2 / write2 variants for simplicity.
178  // TODO: We should report true if the used offsets are adjacent (excluded
179  // st64 versions).
180  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
181  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
182  return false;
183 
184  Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
185  Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
186  return true;
187  }
188 
189  if (isSMRD(Opc0) && isSMRD(Opc1)) {
190  // Skip time and cache invalidation instructions.
191  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
192  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
193  return false;
194 
195   assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
196 
197  // Check base reg.
198  if (Load0->getOperand(0) != Load1->getOperand(0))
199  return false;
200 
201  const ConstantSDNode *Load0Offset =
202  dyn_cast<ConstantSDNode>(Load0->getOperand(1));
203  const ConstantSDNode *Load1Offset =
204  dyn_cast<ConstantSDNode>(Load1->getOperand(1));
205 
206  if (!Load0Offset || !Load1Offset)
207  return false;
208 
209  // Check chain.
210  if (findChainOperand(Load0) != findChainOperand(Load1))
211  return false;
212 
213  Offset0 = Load0Offset->getZExtValue();
214  Offset1 = Load1Offset->getZExtValue();
215  return true;
216  }
217 
218  // MUBUF and MTBUF can access the same addresses.
219  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
220 
221  // MUBUF and MTBUF have vaddr at different indices.
222  if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
223  findChainOperand(Load0) != findChainOperand(Load1) ||
224  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
225  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
226  return false;
227 
228  int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
229  int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
230 
231  if (OffIdx0 == -1 || OffIdx1 == -1)
232  return false;
233 
234  // getNamedOperandIdx returns the index for MachineInstrs. Since they
235  // include the output in the operand list, but SDNodes don't, we need to
236  // subtract the index by one.
237  --OffIdx0;
238  --OffIdx1;
239 
240  SDValue Off0 = Load0->getOperand(OffIdx0);
241  SDValue Off1 = Load1->getOperand(OffIdx1);
242 
243  // The offset might be a FrameIndexSDNode.
244  if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
245  return false;
246 
247  Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
248  Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
249  return true;
250  }
251 
252  return false;
253 }
254 
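// The DS read2st64/write2st64 forms encode their two offsets in units of
// 64 elements, so callers must scale the element size accordingly.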
255 static bool isStride64(unsigned Opc) {
256  switch (Opc) {
257  case AMDGPU::DS_READ2ST64_B32:
258  case AMDGPU::DS_READ2ST64_B64:
259  case AMDGPU::DS_WRITE2ST64_B32:
260  case AMDGPU::DS_WRITE2ST64_B64:
261  return true;
262  default:
263  return false;
264  }
265 }
266 
267 bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
268                                           MachineOperand *&BaseOp,
269  int64_t &Offset,
270  const TargetRegisterInfo *TRI) const {
271  unsigned Opc = LdSt.getOpcode();
272 
273  if (isDS(LdSt)) {
274  const MachineOperand *OffsetImm =
275  getNamedOperand(LdSt, AMDGPU::OpName::offset);
276  if (OffsetImm) {
277  // Normal, single offset LDS instruction.
278  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
279  // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
280  // report that here?
281  if (!BaseOp)
282  return false;
283 
284  Offset = OffsetImm->getImm();
285  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
286  "operands of type register.");
287  return true;
288  }
289 
290  // The 2 offset instructions use offset0 and offset1 instead. We can treat
291  // these as a load with a single offset if the 2 offsets are consecutive. We
292  // will use this for some partially aligned loads.
293  const MachineOperand *Offset0Imm =
294  getNamedOperand(LdSt, AMDGPU::OpName::offset0);
295  const MachineOperand *Offset1Imm =
296  getNamedOperand(LdSt, AMDGPU::OpName::offset1);
297 
298  uint8_t Offset0 = Offset0Imm->getImm();
299  uint8_t Offset1 = Offset1Imm->getImm();
300 
301  if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
302  // Each of these offsets is in element sized units, so we need to convert
303  // to bytes of the individual reads.
304 
305  unsigned EltSize;
306  if (LdSt.mayLoad())
307  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
308  else {
309  assert(LdSt.mayStore());
310  int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
311  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
312  }
313 
314  if (isStride64(Opc))
315  EltSize *= 64;
316 
317  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
318  Offset = EltSize * Offset0;
319  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
320  "operands of type register.");
321  return true;
322  }
323 
324  return false;
325  }
326 
327  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
328  const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
329  if (SOffset && SOffset->isReg())
330  return false;
331 
332  MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
333  if (!AddrReg)
334  return false;
335 
336  const MachineOperand *OffsetImm =
337  getNamedOperand(LdSt, AMDGPU::OpName::offset);
338  BaseOp = AddrReg;
339  Offset = OffsetImm->getImm();
340 
341  if (SOffset) // soffset can be an inline immediate.
342  Offset += SOffset->getImm();
343 
344  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
345  "operands of type register.");
346  return true;
347  }
348 
349  if (isSMRD(LdSt)) {
350  const MachineOperand *OffsetImm =
351  getNamedOperand(LdSt, AMDGPU::OpName::offset);
352  if (!OffsetImm)
353  return false;
354 
355  MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
356  BaseOp = SBaseReg;
357  Offset = OffsetImm->getImm();
358  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
359  "operands of type register.");
360  return true;
361  }
362 
363  if (isFLAT(LdSt)) {
364  MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
365  if (VAddr) {
366  // Can't analyze 2 offsets.
367  if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
368  return false;
369 
370  BaseOp = VAddr;
371  } else {
372  // scratch instructions have either vaddr or saddr.
373  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
374  }
375 
376  Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
377  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
378  "operands of type register.");
379  return true;
380  }
381 
382  return false;
383 }
384 
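// Conservatively decide whether two memory instructions share a base pointer:
// first compare the base operands directly, then fall back to comparing the
// underlying IR objects of their single memory operands.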
385 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
386  const MachineOperand &BaseOp1,
387  const MachineInstr &MI2,
388  const MachineOperand &BaseOp2) {
389  // Support only base operands with base registers.
390  // Note: this could be extended to support FI operands.
391  if (!BaseOp1.isReg() || !BaseOp2.isReg())
392  return false;
393 
394  if (BaseOp1.isIdenticalTo(BaseOp2))
395  return true;
396 
397  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
398  return false;
399 
400  auto MO1 = *MI1.memoperands_begin();
401  auto MO2 = *MI2.memoperands_begin();
402  if (MO1->getAddrSpace() != MO2->getAddrSpace())
403  return false;
404 
405  auto Base1 = MO1->getValue();
406  auto Base2 = MO2->getValue();
407  if (!Base1 || !Base2)
408  return false;
409  const MachineFunction &MF = *MI1.getParent()->getParent();
410  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
411  Base1 = GetUnderlyingObject(Base1, DL);
412   Base2 = GetUnderlyingObject(Base2, DL);
413 
414  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
415  return false;
416 
417  return Base1 == Base2;
418 }
419 
420 bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
421                                       MachineOperand &BaseOp2,
422  unsigned NumLoads) const {
423  MachineInstr &FirstLdSt = *BaseOp1.getParent();
424  MachineInstr &SecondLdSt = *BaseOp2.getParent();
425 
426  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
427  return false;
428 
429  const MachineOperand *FirstDst = nullptr;
430  const MachineOperand *SecondDst = nullptr;
431 
432  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
433  (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
434  (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
435  const unsigned MaxGlobalLoadCluster = 6;
436  if (NumLoads > MaxGlobalLoadCluster)
437  return false;
438 
439  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
440  if (!FirstDst)
441  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
442  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
443  if (!SecondDst)
444  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
445  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
446  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
447  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
448  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
449  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
450  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
451  }
452 
453  if (!FirstDst || !SecondDst)
454  return false;
455 
456  // Try to limit clustering based on the total number of bytes loaded
457  // rather than the number of instructions. This is done to help reduce
458  // register pressure. The method used is somewhat inexact, though,
459  // because it assumes that all loads in the cluster will load the
460  // same number of bytes as FirstLdSt.
461 
462  // The unit of this value is bytes.
463  // FIXME: This needs finer tuning.
464  unsigned LoadClusterThreshold = 16;
465 
466  const MachineRegisterInfo &MRI =
467  FirstLdSt.getParent()->getParent()->getRegInfo();
468  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
469 
470  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
471 }
472 
473 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
474 // the first 16 loads will be interleaved with the stores, and the next 16 will
475 // be clustered as expected. It should really split into two batches of 16 stores.
476 //
477 // Loads are clustered until this returns false, rather than trying to schedule
478 // groups of stores. This also means we have to deal with saying different
479 // address space loads should be clustered, and ones which might cause bank
480 // conflicts.
481 //
482 // This might be deprecated so it might not be worth that much effort to fix.
483 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
484                                           int64_t Offset0, int64_t Offset1,
485  unsigned NumLoads) const {
486  assert(Offset1 > Offset0 &&
487  "Second offset should be larger than first offset!");
488  // If we have less than 16 loads in a row, and the offsets are within 64
489  // bytes, then schedule together.
490 
491  // A cacheline is 64 bytes (for global memory).
492  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
493 }
494 
495 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
496                               MachineBasicBlock::iterator MI,
497                               const DebugLoc &DL, unsigned DestReg,
498  unsigned SrcReg, bool KillSrc) {
499  MachineFunction *MF = MBB.getParent();
500  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
501  "illegal SGPR to VGPR copy",
502  DL, DS_Error);
503  LLVMContext &C = MF->getFunction().getContext();
504  C.diagnose(IllegalCopy);
505 
506  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
507  .addReg(SrcReg, getKillRegState(KillSrc));
508 }
509 
510 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
511                               MachineBasicBlock::iterator MI,
512                               const DebugLoc &DL, unsigned DestReg,
513  unsigned SrcReg, bool KillSrc) const {
514  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
515 
516  if (RC == &AMDGPU::VGPR_32RegClass) {
517  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
518  AMDGPU::SReg_32RegClass.contains(SrcReg));
519  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
520  .addReg(SrcReg, getKillRegState(KillSrc));
521  return;
522  }
523 
524  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
525  RC == &AMDGPU::SReg_32RegClass) {
526  if (SrcReg == AMDGPU::SCC) {
527  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
528  .addImm(-1)
529  .addImm(0);
530  return;
531  }
532 
533  if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
534  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
535  return;
536  }
537 
538  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
539  .addReg(SrcReg, getKillRegState(KillSrc));
540  return;
541  }
542 
543  if (RC == &AMDGPU::SReg_64RegClass) {
544  if (DestReg == AMDGPU::VCC) {
545  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
546  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
547  .addReg(SrcReg, getKillRegState(KillSrc));
548  } else {
549  // FIXME: Hack until VReg_1 removed.
550  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
551  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
552  .addImm(0)
553  .addReg(SrcReg, getKillRegState(KillSrc));
554  }
555 
556  return;
557  }
558 
559  if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
560  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
561  return;
562  }
563 
564  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
565  .addReg(SrcReg, getKillRegState(KillSrc));
566  return;
567  }
568 
569  if (DestReg == AMDGPU::SCC) {
570  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
571  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
572  .addReg(SrcReg, getKillRegState(KillSrc))
573  .addImm(0);
574  return;
575  }
576 
577  unsigned EltSize = 4;
578  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
579  if (RI.isSGPRClass(RC)) {
580  if (RI.getRegSizeInBits(*RC) > 32) {
581  Opcode = AMDGPU::S_MOV_B64;
582  EltSize = 8;
583  } else {
584  Opcode = AMDGPU::S_MOV_B32;
585  EltSize = 4;
586  }
587 
588  if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
589  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
590  return;
591  }
592  }
593 
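  // Copies wider than one 32/64-bit move are split into per-subregister moves.
  // The direction (Forward) is chosen so that overlapping source/destination
  // sub-registers are not clobbered before they are read. The first move
  // implicitly defines the whole destination register and the last one carries
  // the source kill flag so liveness stays correct.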
594  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
595  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
596 
597  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
598  unsigned SubIdx;
599  if (Forward)
600  SubIdx = SubIndices[Idx];
601  else
602  SubIdx = SubIndices[SubIndices.size() - Idx - 1];
603 
604  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
605  get(Opcode), RI.getSubReg(DestReg, SubIdx));
606 
607  Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
608 
609  if (Idx == 0)
610  Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
611 
612  bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
613  Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
614  }
615 }
616 
617 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
618  int NewOpc;
619 
620  // Try to map original to commuted opcode
621  NewOpc = AMDGPU::getCommuteRev(Opcode);
622  if (NewOpc != -1)
623  // Check if the commuted (REV) opcode exists on the target.
624  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
625 
626  // Try to map commuted to original opcode
627  NewOpc = AMDGPU::getCommuteOrig(Opcode);
628  if (NewOpc != -1)
629  // Check if the original (non-REV) opcode exists on the target.
630  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
631 
632  return Opcode;
633 }
634 
635 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
636                                        MachineBasicBlock::iterator MI,
637                                        const DebugLoc &DL, unsigned DestReg,
638                                        int64_t Value) const {
639   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
640   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
641  if (RegClass == &AMDGPU::SReg_32RegClass ||
642  RegClass == &AMDGPU::SGPR_32RegClass ||
643  RegClass == &AMDGPU::SReg_32_XM0RegClass ||
644  RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
645  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
646  .addImm(Value);
647  return;
648  }
649 
650  if (RegClass == &AMDGPU::SReg_64RegClass ||
651  RegClass == &AMDGPU::SGPR_64RegClass ||
652  RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
653  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
654  .addImm(Value);
655  return;
656  }
657 
658  if (RegClass == &AMDGPU::VGPR_32RegClass) {
659  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
660  .addImm(Value);
661  return;
662  }
663  if (RegClass == &AMDGPU::VReg_64RegClass) {
664  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
665  .addImm(Value);
666  return;
667  }
668 
669  unsigned EltSize = 4;
670  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
671  if (RI.isSGPRClass(RegClass)) {
672  if (RI.getRegSizeInBits(*RegClass) > 32) {
673  Opcode = AMDGPU::S_MOV_B64;
674  EltSize = 8;
675  } else {
676  Opcode = AMDGPU::S_MOV_B32;
677  EltSize = 4;
678  }
679  }
680 
681  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
682  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
683  int64_t IdxValue = Idx == 0 ? Value : 0;
684 
685  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
686  get(Opcode), RI.getSubReg(DestReg, Idx));
687  Builder.addImm(IdxValue);
688  }
689 }
690 
691 const TargetRegisterClass *
692 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
693   return &AMDGPU::VGPR_32RegClass;
694 }
695 
696 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
697                                      MachineBasicBlock::iterator I,
698                                      const DebugLoc &DL, unsigned DstReg,
699                                      ArrayRef<MachineOperand> Cond,
700                                      unsigned TrueReg,
701                                      unsigned FalseReg) const {
702   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
703   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
704  "Not a VGPR32 reg");
705 
706  if (Cond.size() == 1) {
707  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
708  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
709  .add(Cond[0]);
710  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
711  .addReg(FalseReg)
712  .addReg(TrueReg)
713  .addReg(SReg);
714  } else if (Cond.size() == 2) {
715  assert(Cond[0].isImm() && "Cond[0] is not an immediate");
716  switch (Cond[0].getImm()) {
717  case SIInstrInfo::SCC_TRUE: {
718  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
719  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
720  .addImm(-1)
721  .addImm(0);
722  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
723  .addReg(FalseReg)
724  .addReg(TrueReg)
725  .addReg(SReg);
726  break;
727  }
728  case SIInstrInfo::SCC_FALSE: {
729  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
730  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
731  .addImm(0)
732  .addImm(-1);
733  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
734  .addReg(FalseReg)
735  .addReg(TrueReg)
736  .addReg(SReg);
737  break;
738  }
739  case SIInstrInfo::VCCNZ: {
740  MachineOperand RegOp = Cond[1];
741  RegOp.setImplicit(false);
742  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
743  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
744  .add(RegOp);
745  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
746  .addReg(FalseReg)
747  .addReg(TrueReg)
748  .addReg(SReg);
749  break;
750  }
751  case SIInstrInfo::VCCZ: {
752  MachineOperand RegOp = Cond[1];
753  RegOp.setImplicit(false);
754  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
755  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
756  .add(RegOp);
757  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
758  .addReg(TrueReg)
759  .addReg(FalseReg)
760  .addReg(SReg);
761  break;
762  }
763  case SIInstrInfo::EXECNZ: {
764  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
765  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
766  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
767  .addImm(0);
768  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
769  .addImm(-1)
770  .addImm(0);
771  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
772  .addReg(FalseReg)
773  .addReg(TrueReg)
774  .addReg(SReg);
775  break;
776  }
777  case SIInstrInfo::EXECZ: {
778  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
779  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
780  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
781  .addImm(0);
782  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
783  .addImm(0)
784  .addImm(-1);
785  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
786  .addReg(FalseReg)
787  .addReg(TrueReg)
788  .addReg(SReg);
789  llvm_unreachable("Unhandled branch predicate EXECZ");
790  break;
791  }
792  default:
793  llvm_unreachable("invalid branch predicate");
794  }
795  } else {
796  llvm_unreachable("Can only handle Cond size 1 or 2");
797  }
798 }
799 
800 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
801                                MachineBasicBlock::iterator I,
802                                const DebugLoc &DL,
803                                unsigned SrcReg, int Value) const {
804   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
805   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
806  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
807  .addImm(Value)
808  .addReg(SrcReg);
809 
810  return Reg;
811 }
812 
813 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
814                                MachineBasicBlock::iterator I,
815                                const DebugLoc &DL,
816                                unsigned SrcReg, int Value) const {
817   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
818   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
819  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
820  .addImm(Value)
821  .addReg(SrcReg);
822 
823  return Reg;
824 }
825 
826 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
827 
828  if (RI.getRegSizeInBits(*DstRC) == 32) {
829  return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
830  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
831  return AMDGPU::S_MOV_B64;
832  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
833  return AMDGPU::V_MOV_B64_PSEUDO;
834  }
835  return AMDGPU::COPY;
836 }
837 
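// Map a spill size in bytes to the matching SGPR spill pseudo. The pseudos are
// lowered later (to VGPR lanes or scalar stores) rather than expanded here.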
838 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
839  switch (Size) {
840  case 4:
841  return AMDGPU::SI_SPILL_S32_SAVE;
842  case 8:
843  return AMDGPU::SI_SPILL_S64_SAVE;
844  case 16:
845  return AMDGPU::SI_SPILL_S128_SAVE;
846  case 32:
847  return AMDGPU::SI_SPILL_S256_SAVE;
848  case 64:
849  return AMDGPU::SI_SPILL_S512_SAVE;
850  default:
851  llvm_unreachable("unknown register size");
852  }
853 }
854 
855 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
856  switch (Size) {
857  case 4:
858  return AMDGPU::SI_SPILL_V32_SAVE;
859  case 8:
860  return AMDGPU::SI_SPILL_V64_SAVE;
861  case 12:
862  return AMDGPU::SI_SPILL_V96_SAVE;
863  case 16:
864  return AMDGPU::SI_SPILL_V128_SAVE;
865  case 32:
866  return AMDGPU::SI_SPILL_V256_SAVE;
867  case 64:
868  return AMDGPU::SI_SPILL_V512_SAVE;
869  default:
870  llvm_unreachable("unknown register size");
871  }
872 }
873 
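// Spill SrcReg to the stack slot FrameIndex. SGPR spills are emitted as
// pseudos so they can be rewritten later; VGPR spills go straight to scratch
// buffer spill pseudos that reference the scratch resource and offset.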
874 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
875                                       MachineBasicBlock::iterator MI,
876                                       unsigned SrcReg, bool isKill,
877  int FrameIndex,
878  const TargetRegisterClass *RC,
879  const TargetRegisterInfo *TRI) const {
880  MachineFunction *MF = MBB.getParent();
881   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
882   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
883  const DebugLoc &DL = MBB.findDebugLoc(MI);
884 
885  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
886  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
887  MachinePointerInfo PtrInfo
888  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
889  MachineMemOperand *MMO
890     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
891                                Size, Align);
892  unsigned SpillSize = TRI->getSpillSize(*RC);
893 
894  if (RI.isSGPRClass(RC)) {
895  MFI->setHasSpilledSGPRs();
896 
897  // We are only allowed to create one new instruction when spilling
898  // registers, so we need to use pseudo instruction for spilling SGPRs.
899  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
900 
901   // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
902  // to make sure we are using the correct register class.
903  if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
904     MachineRegisterInfo &MRI = MF->getRegInfo();
905     MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
906  }
907 
908  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
909  .addReg(SrcReg, getKillRegState(isKill)) // data
910  .addFrameIndex(FrameIndex) // addr
911  .addMemOperand(MMO)
912     .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
913     .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
914  // Add the scratch resource registers as implicit uses because we may end up
915  // needing them, and need to ensure that the reserved registers are
916  // correctly handled.
917 
918  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
919  if (ST.hasScalarStores()) {
920  // m0 is used for offset to scalar stores if used to spill.
921  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
922  }
923 
924  return;
925  }
926 
927  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
928 
929  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
930  MFI->setHasSpilledVGPRs();
931  BuildMI(MBB, MI, DL, get(Opcode))
932  .addReg(SrcReg, getKillRegState(isKill)) // data
933  .addFrameIndex(FrameIndex) // addr
934  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
935  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
936  .addImm(0) // offset
937  .addMemOperand(MMO);
938 }
939 
940 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
941  switch (Size) {
942  case 4:
943  return AMDGPU::SI_SPILL_S32_RESTORE;
944  case 8:
945  return AMDGPU::SI_SPILL_S64_RESTORE;
946  case 16:
947  return AMDGPU::SI_SPILL_S128_RESTORE;
948  case 32:
949  return AMDGPU::SI_SPILL_S256_RESTORE;
950  case 64:
951  return AMDGPU::SI_SPILL_S512_RESTORE;
952  default:
953  llvm_unreachable("unknown register size");
954  }
955 }
956 
957 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
958  switch (Size) {
959  case 4:
960  return AMDGPU::SI_SPILL_V32_RESTORE;
961  case 8:
962  return AMDGPU::SI_SPILL_V64_RESTORE;
963  case 12:
964  return AMDGPU::SI_SPILL_V96_RESTORE;
965  case 16:
966  return AMDGPU::SI_SPILL_V128_RESTORE;
967  case 32:
968  return AMDGPU::SI_SPILL_V256_RESTORE;
969  case 64:
970  return AMDGPU::SI_SPILL_V512_RESTORE;
971  default:
972  llvm_unreachable("unknown register size");
973  }
974 }
975 
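// Reload DestReg from the stack slot FrameIndex, mirroring storeRegToStackSlot
// above.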
976 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
977                                        MachineBasicBlock::iterator MI,
978                                        unsigned DestReg, int FrameIndex,
979  const TargetRegisterClass *RC,
980  const TargetRegisterInfo *TRI) const {
981  MachineFunction *MF = MBB.getParent();
982   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
983   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
984  const DebugLoc &DL = MBB.findDebugLoc(MI);
985  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
986  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
987  unsigned SpillSize = TRI->getSpillSize(*RC);
988 
989  MachinePointerInfo PtrInfo
990  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
991 
992   MachineMemOperand *MMO = MF->getMachineMemOperand(
993       PtrInfo, MachineMemOperand::MOLoad, Size, Align);
994 
995  if (RI.isSGPRClass(RC)) {
996  MFI->setHasSpilledSGPRs();
997 
998  // FIXME: Maybe this should not include a memoperand because it will be
999  // lowered to non-memory instructions.
1000  const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1001  if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
1002     MachineRegisterInfo &MRI = MF->getRegInfo();
1003     MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
1004  }
1005 
1006  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
1007  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
1008  .addFrameIndex(FrameIndex) // addr
1009  .addMemOperand(MMO)
1010     .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
1011     .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
1012 
1013  if (ST.hasScalarStores()) {
1014  // m0 is used for offset to scalar stores if used to spill.
1015  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
1016  }
1017 
1018  return;
1019  }
1020 
1021  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
1022 
1023  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
1024  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1025  .addFrameIndex(FrameIndex) // vaddr
1026  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
1027  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
1028  .addImm(0) // offset
1029  .addMemOperand(MMO);
1030 }
1031 
1032 /// \param FrameOffset Offset in bytes of the FrameIndex being spilled
1033 unsigned SIInstrInfo::calculateLDSSpillAddress(
1034  MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
1035  unsigned FrameOffset, unsigned Size) const {
1036  MachineFunction *MF = MBB.getParent();
1037   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1038   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1039  const DebugLoc &DL = MBB.findDebugLoc(MI);
1040  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1041  unsigned WavefrontSize = ST.getWavefrontSize();
1042 
1043  unsigned TIDReg = MFI->getTIDReg();
1044  if (!MFI->hasCalculatedTID()) {
1045  MachineBasicBlock &Entry = MBB.getParent()->front();
1046  MachineBasicBlock::iterator Insert = Entry.front();
1047  const DebugLoc &DL = Insert->getDebugLoc();
1048 
1049  TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1050  *MF);
1051  if (TIDReg == AMDGPU::NoRegister)
1052  return TIDReg;
1053 
1055  WorkGroupSize > WavefrontSize) {
1056  unsigned TIDIGXReg
1058  unsigned TIDIGYReg
1060  unsigned TIDIGZReg
1062  unsigned InputPtrReg =
1064  for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1065  if (!Entry.isLiveIn(Reg))
1066  Entry.addLiveIn(Reg);
1067  }
1068 
1069  RS->enterBasicBlock(Entry);
1070  // FIXME: Can we scavenge an SReg_64 and access the subregs?
1071  unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1072  unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1073  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1074  .addReg(InputPtrReg)
1076  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1077  .addReg(InputPtrReg)
1079 
1080  // NGROUPS.X * NGROUPS.Y
1081  BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1082  .addReg(STmp1)
1083  .addReg(STmp0);
1084  // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1085  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1086  .addReg(STmp1)
1087  .addReg(TIDIGXReg);
1088   // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1089  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1090  .addReg(STmp0)
1091  .addReg(TIDIGYReg)
1092  .addReg(TIDReg);
1093   // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1094  getAddNoCarry(Entry, Insert, DL, TIDReg)
1095  .addReg(TIDReg)
1096  .addReg(TIDIGZReg);
1097  } else {
1098  // Get the wave id
1099  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1100  TIDReg)
1101  .addImm(-1)
1102  .addImm(0);
1103 
1104  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1105  TIDReg)
1106  .addImm(-1)
1107  .addReg(TIDReg);
1108  }
1109 
1110  BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1111  TIDReg)
1112  .addImm(2)
1113  .addReg(TIDReg);
1114  MFI->setTIDReg(TIDReg);
1115  }
1116 
1117  // Add FrameIndex to LDS offset
1118  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1119  getAddNoCarry(MBB, MI, DL, TmpReg)
1120  .addImm(LDSOffset)
1121  .addReg(TIDReg);
1122 
1123  return TmpReg;
1124 }
1125 
1126 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1127                                    MachineBasicBlock::iterator MI,
1128                                    int Count) const {
1129  DebugLoc DL = MBB.findDebugLoc(MI);
1130  while (Count > 0) {
1131  int Arg;
1132  if (Count >= 8)
1133  Arg = 7;
1134  else
1135  Arg = Count - 1;
1136  Count -= 8;
1137  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1138  .addImm(Arg);
1139  }
1140 }
1141 
1142 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1143                              MachineBasicBlock::iterator MI) const {
1144   insertWaitStates(MBB, MI, 1);
1145 }
1146 
1147 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1148   auto MF = MBB.getParent();
1149   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1150 
1151  assert(Info->isEntryFunction());
1152 
1153  if (MBB.succ_empty()) {
1154  bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1155  if (HasNoTerminator)
1156  BuildMI(MBB, MBB.end(), DebugLoc(),
1157  get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1158  }
1159 }
1160 
1161 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1162   switch (MI.getOpcode()) {
1163  default: return 1; // FIXME: Do wait states equal cycles?
1164 
1165  case AMDGPU::S_NOP:
1166  return MI.getOperand(0).getImm() + 1;
1167  }
1168 }
1169 
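// Expand pseudo instructions that must survive until after register
// allocation (terminator copies, 64-bit moves, WWM markers, memory-clause
// bundles, PC-relative address computations) into real machine instructions.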
1170 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1171   MachineBasicBlock &MBB = *MI.getParent();
1172  DebugLoc DL = MBB.findDebugLoc(MI);
1173  switch (MI.getOpcode()) {
1174  default: return TargetInstrInfo::expandPostRAPseudo(MI);
1175  case AMDGPU::S_MOV_B64_term:
1176  // This is only a terminator to get the correct spill code placement during
1177  // register allocation.
1178  MI.setDesc(get(AMDGPU::S_MOV_B64));
1179  break;
1180 
1181  case AMDGPU::S_XOR_B64_term:
1182  // This is only a terminator to get the correct spill code placement during
1183  // register allocation.
1184  MI.setDesc(get(AMDGPU::S_XOR_B64));
1185  break;
1186 
1187  case AMDGPU::S_ANDN2_B64_term:
1188  // This is only a terminator to get the correct spill code placement during
1189  // register allocation.
1190  MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1191  break;
1192 
1193  case AMDGPU::V_MOV_B64_PSEUDO: {
1194  unsigned Dst = MI.getOperand(0).getReg();
1195  unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1196  unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1197 
1198  const MachineOperand &SrcOp = MI.getOperand(1);
1199  // FIXME: Will this work for 64-bit floating point immediates?
1200  assert(!SrcOp.isFPImm());
1201  if (SrcOp.isImm()) {
1202  APInt Imm(64, SrcOp.getImm());
1203  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1204  .addImm(Imm.getLoBits(32).getZExtValue())
1205  .addReg(Dst, RegState::Implicit | RegState::Define);
1206  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1207  .addImm(Imm.getHiBits(32).getZExtValue())
1208  .addReg(Dst, RegState::Implicit | RegState::Define);
1209  } else {
1210  assert(SrcOp.isReg());
1211  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1212       .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1213       .addReg(Dst, RegState::Implicit | RegState::Define);
1214  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1215       .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1216       .addReg(Dst, RegState::Implicit | RegState::Define);
1217  }
1218  MI.eraseFromParent();
1219  break;
1220  }
1221  case AMDGPU::V_SET_INACTIVE_B32: {
1222  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1223  .addReg(AMDGPU::EXEC);
1224  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1225  .add(MI.getOperand(2));
1226  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1227  .addReg(AMDGPU::EXEC);
1228  MI.eraseFromParent();
1229  break;
1230  }
1231  case AMDGPU::V_SET_INACTIVE_B64: {
1232  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1233  .addReg(AMDGPU::EXEC);
1234  MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1235  MI.getOperand(0).getReg())
1236  .add(MI.getOperand(2));
1237  expandPostRAPseudo(*Copy);
1238  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1239  .addReg(AMDGPU::EXEC);
1240  MI.eraseFromParent();
1241  break;
1242  }
1243  case AMDGPU::V_MOVRELD_B32_V1:
1244  case AMDGPU::V_MOVRELD_B32_V2:
1245  case AMDGPU::V_MOVRELD_B32_V4:
1246  case AMDGPU::V_MOVRELD_B32_V8:
1247  case AMDGPU::V_MOVRELD_B32_V16: {
1248  const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1249  unsigned VecReg = MI.getOperand(0).getReg();
1250  bool IsUndef = MI.getOperand(1).isUndef();
1251  unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1252  assert(VecReg == MI.getOperand(1).getReg());
1253 
1254  MachineInstr *MovRel =
1255  BuildMI(MBB, MI, DL, MovRelDesc)
1256  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1257  .add(MI.getOperand(2))
1258  .addReg(VecReg, RegState::ImplicitDefine)
1259  .addReg(VecReg,
1260  RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1261 
1262  const int ImpDefIdx =
1263  MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1264  const int ImpUseIdx = ImpDefIdx + 1;
1265  MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1266 
1267  MI.eraseFromParent();
1268  break;
1269  }
1270  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1271  MachineFunction &MF = *MBB.getParent();
1272  unsigned Reg = MI.getOperand(0).getReg();
1273  unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1274  unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1275 
1276  // Create a bundle so these instructions won't be re-ordered by the
1277  // post-RA scheduler.
1278  MIBundleBuilder Bundler(MBB, MI);
1279  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1280 
1281  // Add 32-bit offset from this instruction to the start of the
1282  // constant data.
1283  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1284  .addReg(RegLo)
1285  .add(MI.getOperand(1)));
1286 
1287  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1288                                   .addReg(RegHi);
1289     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1290  MIB.addImm(0);
1291  else
1292  MIB.add(MI.getOperand(2));
1293 
1294  Bundler.append(MIB);
1295  finalizeBundle(MBB, Bundler.begin());
1296 
1297  MI.eraseFromParent();
1298  break;
1299  }
1300  case AMDGPU::EXIT_WWM: {
1301  // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1302  // is exited.
1303  MI.setDesc(get(AMDGPU::S_MOV_B64));
1304  break;
1305  }
1306  case TargetOpcode::BUNDLE: {
1307  if (!MI.mayLoad())
1308  return false;
1309 
1310  // If it is a load it must be a memory clause
1311     for (MachineBasicBlock::instr_iterator I = MI.getIterator();
1312          I->isBundledWithSucc(); ++I) {
1313  I->unbundleFromSucc();
1314  for (MachineOperand &MO : I->operands())
1315  if (MO.isReg())
1316  MO.setIsInternalRead(false);
1317  }
1318 
1319  MI.eraseFromParent();
1320  break;
1321  }
1322  }
1323  return true;
1324 }
1325 
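// When commuting an instruction, the src0/src1 modifier operands (neg, abs,
// etc.) have to be swapped along with the source operands themselves.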
1326 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1327                                       MachineOperand &Src0,
1328  unsigned Src0OpName,
1329  MachineOperand &Src1,
1330  unsigned Src1OpName) const {
1331  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1332  if (!Src0Mods)
1333  return false;
1334 
1335  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1336  assert(Src1Mods &&
1337  "All commutable instructions have both src0 and src1 modifiers");
1338 
1339  int Src0ModsVal = Src0Mods->getImm();
1340  int Src1ModsVal = Src1Mods->getImm();
1341 
1342  Src1Mods->setImm(Src0ModsVal);
1343  Src0Mods->setImm(Src1ModsVal);
1344  return true;
1345 }
1346 
1347 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1348                                              MachineOperand &RegOp,
1349  MachineOperand &NonRegOp) {
1350  unsigned Reg = RegOp.getReg();
1351  unsigned SubReg = RegOp.getSubReg();
1352  bool IsKill = RegOp.isKill();
1353  bool IsDead = RegOp.isDead();
1354  bool IsUndef = RegOp.isUndef();
1355  bool IsDebug = RegOp.isDebug();
1356 
1357  if (NonRegOp.isImm())
1358  RegOp.ChangeToImmediate(NonRegOp.getImm());
1359  else if (NonRegOp.isFI())
1360  RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1361  else
1362  return nullptr;
1363 
1364  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1365  NonRegOp.setSubReg(SubReg);
1366 
1367  return &MI;
1368 }
1369 
1370 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1371                                                   unsigned Src0Idx,
1372  unsigned Src1Idx) const {
1373  assert(!NewMI && "this should never be used");
1374 
1375  unsigned Opc = MI.getOpcode();
1376  int CommutedOpcode = commuteOpcode(Opc);
1377  if (CommutedOpcode == -1)
1378  return nullptr;
1379 
1380  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1381  static_cast<int>(Src0Idx) &&
1382  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1383  static_cast<int>(Src1Idx) &&
1384  "inconsistency with findCommutedOpIndices");
1385 
1386  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1387  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1388 
1389  MachineInstr *CommutedMI = nullptr;
1390  if (Src0.isReg() && Src1.isReg()) {
1391  if (isOperandLegal(MI, Src1Idx, &Src0)) {
1392  // Be sure to copy the source modifiers to the right place.
1393  CommutedMI
1394  = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1395  }
1396 
1397  } else if (Src0.isReg() && !Src1.isReg()) {
1398  // src0 should always be able to support any operand type, so no need to
1399  // check operand legality.
1400  CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1401  } else if (!Src0.isReg() && Src1.isReg()) {
1402  if (isOperandLegal(MI, Src1Idx, &Src0))
1403  CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1404  } else {
1405  // FIXME: Found two non registers to commute. This does happen.
1406  return nullptr;
1407  }
1408 
1409  if (CommutedMI) {
1410  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1411  Src1, AMDGPU::OpName::src1_modifiers);
1412 
1413  CommutedMI->setDesc(get(CommutedOpcode));
1414  }
1415 
1416  return CommutedMI;
1417 }
1418 
1419 // This needs to be implemented because the source modifiers may be inserted
1420 // between the true commutable operands, and the base
1421 // TargetInstrInfo::commuteInstruction uses it.
1422 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1423                                         unsigned &SrcOpIdx1) const {
1424  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
1425 }
1426 
1427 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
1428  unsigned &SrcOpIdx1) const {
1429  if (!Desc.isCommutable())
1430  return false;
1431 
1432  unsigned Opc = Desc.getOpcode();
1433  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1434  if (Src0Idx == -1)
1435  return false;
1436 
1437  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1438  if (Src1Idx == -1)
1439  return false;
1440 
1441  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1442 }
1443 
1444 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1445  int64_t BrOffset) const {
1446  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1447  // block is unanalyzable.
1448  assert(BranchOp != AMDGPU::S_SETPC_B64);
1449 
1450  // Convert to dwords.
1451  BrOffset /= 4;
1452 
1453  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1454  // from the next instruction.
1455  BrOffset -= 1;
1456 
1457  return isIntN(BranchOffsetBits, BrOffset);
1458 }
1459 
1460 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1461     const MachineInstr &MI) const {
1462  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1463  // This would be a difficult analysis to perform, but can always be legal so
1464  // there's no need to analyze it.
1465  return nullptr;
1466  }
1467 
1468  return MI.getOperand(0).getMBB();
1469 }
1470 
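// Expand an unconditional branch whose target is out of range for s_branch by
// materializing the destination address relative to s_getpc_b64 and jumping
// through s_setpc_b64. A 64-bit scalar register pair is scavenged to hold the
// computed address.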
1471 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1472                                            MachineBasicBlock &DestBB,
1473  const DebugLoc &DL,
1474  int64_t BrOffset,
1475  RegScavenger *RS) const {
1476  assert(RS && "RegScavenger required for long branching");
1477  assert(MBB.empty() &&
1478  "new block should be inserted for expanding unconditional branch");
1479  assert(MBB.pred_size() == 1);
1480 
1481  MachineFunction *MF = MBB.getParent();
1482  MachineRegisterInfo &MRI = MF->getRegInfo();
1483 
1484  // FIXME: Virtual register workaround for RegScavenger not working with empty
1485  // blocks.
1486  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1487 
1488  auto I = MBB.end();
1489 
1490  // We need to compute the offset relative to the instruction immediately after
1491  // s_getpc_b64. Insert pc arithmetic code before last terminator.
1492  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1493 
1494  // TODO: Handle > 32-bit block address.
1495  if (BrOffset >= 0) {
1496  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1497  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1498  .addReg(PCReg, 0, AMDGPU::sub0)
1500  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1501  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1502  .addReg(PCReg, 0, AMDGPU::sub1)
1503  .addImm(0);
1504  } else {
1505  // Backwards branch.
1506  BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1507  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1508  .addReg(PCReg, 0, AMDGPU::sub0)
1510  BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1511  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1512  .addReg(PCReg, 0, AMDGPU::sub1)
1513  .addImm(0);
1514  }
1515 
1516  // Insert the indirect branch after the other terminator.
1517  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1518  .addReg(PCReg);
1519 
1520  // FIXME: If spilling is necessary, this will fail because this scavenger has
1521  // no emergency stack slots. It is non-trivial to spill in this situation,
1522  // because the restore code needs to be specially placed after the
1523  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1524  // block.
1525  //
1526  // If a spill is needed for the pc register pair, we need to insert a spill
1527  // restore block right before the destination block, and insert a short branch
1528  // into the old destination block's fallthrough predecessor.
1529  // e.g.:
1530  //
1531  // s_cbranch_scc0 skip_long_branch:
1532  //
1533  // long_branch_bb:
1534  // spill s[8:9]
1535  // s_getpc_b64 s[8:9]
1536  // s_add_u32 s8, s8, restore_bb
1537  // s_addc_u32 s9, s9, 0
1538  // s_setpc_b64 s[8:9]
1539  //
1540  // skip_long_branch:
1541  // foo;
1542  //
1543  // .....
1544  //
1545  // dest_bb_fallthrough_predecessor:
1546  // bar;
1547  // s_branch dest_bb
1548  //
1549  // restore_bb:
1550  // restore s[8:9]
1551  // fallthrough dest_bb
1552  ///
1553  // dest_bb:
1554  // buzz;
1555 
1556  RS->enterBasicBlockEnd(MBB);
1557  unsigned Scav = RS->scavengeRegisterBackwards(
1558  AMDGPU::SReg_64RegClass,
1559  MachineBasicBlock::iterator(GetPC), false, 0);
1560  MRI.replaceRegWith(PCReg, Scav);
1561  MRI.clearVirtRegs();
1562  RS->setRegUsed(Scav);
1563 
1564  return 4 + 8 + 4 + 4;
1565 }
1566 
1567 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1568  switch (Cond) {
1569  case SIInstrInfo::SCC_TRUE:
1570  return AMDGPU::S_CBRANCH_SCC1;
1571  case SIInstrInfo::SCC_FALSE:
1572  return AMDGPU::S_CBRANCH_SCC0;
1573  case SIInstrInfo::VCCNZ:
1574  return AMDGPU::S_CBRANCH_VCCNZ;
1575  case SIInstrInfo::VCCZ:
1576  return AMDGPU::S_CBRANCH_VCCZ;
1577  case SIInstrInfo::EXECNZ:
1578  return AMDGPU::S_CBRANCH_EXECNZ;
1579  case SIInstrInfo::EXECZ:
1580  return AMDGPU::S_CBRANCH_EXECZ;
1581  default:
1582  llvm_unreachable("invalid branch predicate");
1583  }
1584 }
1585 
1586 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1587  switch (Opcode) {
1588  case AMDGPU::S_CBRANCH_SCC0:
1589  return SCC_FALSE;
1590  case AMDGPU::S_CBRANCH_SCC1:
1591  return SCC_TRUE;
1592  case AMDGPU::S_CBRANCH_VCCNZ:
1593  return VCCNZ;
1594  case AMDGPU::S_CBRANCH_VCCZ:
1595  return VCCZ;
1596  case AMDGPU::S_CBRANCH_EXECNZ:
1597  return EXECNZ;
1598  case AMDGPU::S_CBRANCH_EXECZ:
1599  return EXECZ;
1600  default:
1601  return INVALID_BR;
1602  }
1603 }
1604 
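// Shared helper for analyzeBranch: classify the terminator sequence starting
// at I as an unconditional branch, a conditional branch, or a conditional
// branch followed by an unconditional one.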
1605 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1606                                     MachineBasicBlock::iterator I,
1607                                     MachineBasicBlock *&TBB,
1608                                     MachineBasicBlock *&FBB,
1609                                     SmallVectorImpl<MachineOperand> &Cond,
1610                                     bool AllowModify) const {
1611  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1612  // Unconditional Branch
1613  TBB = I->getOperand(0).getMBB();
1614  return false;
1615  }
1616 
1617  MachineBasicBlock *CondBB = nullptr;
1618 
1619  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1620  CondBB = I->getOperand(1).getMBB();
1621  Cond.push_back(I->getOperand(0));
1622  } else {
1623  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1624  if (Pred == INVALID_BR)
1625  return true;
1626 
1627  CondBB = I->getOperand(0).getMBB();
1628     Cond.push_back(MachineOperand::CreateImm(Pred));
1629     Cond.push_back(I->getOperand(1)); // Save the branch register.
1630  }
1631  ++I;
1632 
1633  if (I == MBB.end()) {
1634  // Conditional branch followed by fall-through.
1635  TBB = CondBB;
1636  return false;
1637  }
1638 
1639  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1640  TBB = CondBB;
1641  FBB = I->getOperand(0).getMBB();
1642  return false;
1643  }
1644 
1645  return true;
1646 }
1647 
1648 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1649                                 MachineBasicBlock *&FBB,
1650                                 SmallVectorImpl<MachineOperand> &Cond,
1651                                 bool AllowModify) const {
1652   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1653   auto E = MBB.end();
1654  if (I == E)
1655  return false;
1656 
1657  // Skip over the instructions that are artificially terminators for special
1658  // exec management.
1659  while (I != E && !I->isBranch() && !I->isReturn() &&
1660  I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
1661  switch (I->getOpcode()) {
1662  case AMDGPU::SI_MASK_BRANCH:
1663  case AMDGPU::S_MOV_B64_term:
1664  case AMDGPU::S_XOR_B64_term:
1665  case AMDGPU::S_ANDN2_B64_term:
1666  break;
1667  case AMDGPU::SI_IF:
1668  case AMDGPU::SI_ELSE:
1669  case AMDGPU::SI_KILL_I1_TERMINATOR:
1670  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1671  // FIXME: It's messy that these need to be considered here at all.
1672  return true;
1673  default:
1674  llvm_unreachable("unexpected non-branch terminator inst");
1675  }
1676 
1677  ++I;
1678  }
1679 
1680  if (I == E)
1681  return false;
1682 
1683  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1684  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1685 
1686  ++I;
1687 
1688  // TODO: Should be able to treat as fallthrough?
1689  if (I == MBB.end())
1690  return true;
1691 
1692  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1693  return true;
1694 
1695  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1696 
1697  // Specifically handle the case where the conditional branch is to the same
1698  // destination as the mask branch. e.g.
1699  //
1700  // si_mask_branch BB8
1701  // s_cbranch_execz BB8
1702  // s_cbranch BB9
1703  //
1704  // This is required to understand divergent loops which may need the branches
1705  // to be relaxed.
1706  if (TBB != MaskBrDest || Cond.empty())
1707  return true;
1708 
1709  auto Pred = Cond[0].getImm();
1710  return (Pred != EXECZ && Pred != EXECNZ);
1711 }
1712 
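// Erase the branch terminators of MBB, reporting how many instructions and
// bytes were removed. SI_MASK_BRANCH is skipped because it is not a real
// branch at this point.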
1713 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1714                                    int *BytesRemoved) const {
1715   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1716 
1717  unsigned Count = 0;
1718  unsigned RemovedSize = 0;
1719  while (I != MBB.end()) {
1720  MachineBasicBlock::iterator Next = std::next(I);
1721  if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1722  I = Next;
1723  continue;
1724  }
1725 
1726  RemovedSize += getInstSizeInBytes(*I);
1727  I->eraseFromParent();
1728  ++Count;
1729  I = Next;
1730  }
1731 
1732  if (BytesRemoved)
1733  *BytesRemoved = RemovedSize;
1734 
1735  return Count;
1736 }
1737 
1738 // Copy the flags onto the implicit condition register operand.
1739 static void preserveCondRegFlags(MachineOperand &CondReg,
1740                                  const MachineOperand &OrigCond) {
1741  CondReg.setIsUndef(OrigCond.isUndef());
1742  CondReg.setIsKill(OrigCond.isKill());
1743 }
1744 
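// Emit at most two branches: an optional conditional branch to TBB followed by
// an unconditional branch to FBB. Returns the number of instructions inserted;
// BytesAdded, if non-null, receives their total encoded size.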
1745 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1746                                    MachineBasicBlock *TBB,
1747                                    MachineBasicBlock *FBB,
1748                                    ArrayRef<MachineOperand> Cond,
1749                                    const DebugLoc &DL,
1750                                    int *BytesAdded) const {
1751  if (!FBB && Cond.empty()) {
1752  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1753  .addMBB(TBB);
1754  if (BytesAdded)
1755  *BytesAdded = 4;
1756  return 1;
1757  }
1758 
1759   if (Cond.size() == 1 && Cond[0].isReg()) {
1760  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1761  .add(Cond[0])
1762  .addMBB(TBB);
1763  return 1;
1764  }
1765 
1766  assert(TBB && Cond[0].isImm());
1767 
1768  unsigned Opcode
1769  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1770 
1771  if (!FBB) {
1772  Cond[1].isUndef();
1773  MachineInstr *CondBr =
1774  BuildMI(&MBB, DL, get(Opcode))
1775  .addMBB(TBB);
1776 
1777  // Copy the flags onto the implicit condition register operand.
1778  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1779 
1780  if (BytesAdded)
1781  *BytesAdded = 4;
1782  return 1;
1783  }
1784 
1785  assert(TBB && FBB);
1786 
1787  MachineInstr *CondBr =
1788  BuildMI(&MBB, DL, get(Opcode))
1789  .addMBB(TBB);
1790  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1791  .addMBB(FBB);
1792 
1793  MachineOperand &CondReg = CondBr->getOperand(1);
1794  CondReg.setIsUndef(Cond[1].isUndef());
1795  CondReg.setIsKill(Cond[1].isKill());
1796 
1797  if (BytesAdded)
1798  *BytesAdded = 8;
1799 
1800  return 2;
1801 }
1802 
1803 bool SIInstrInfo::reverseBranchCondition(
1804     SmallVectorImpl<MachineOperand> &Cond) const {
1805  if (Cond.size() != 2) {
1806  return true;
1807  }
1808 
1809  if (Cond[0].isImm()) {
1810  Cond[0].setImm(-Cond[0].getImm());
1811  return false;
1812  }
1813 
1814  return true;
1815 }
1816 
1817 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1818                                   ArrayRef<MachineOperand> Cond,
1819                                   unsigned TrueReg, unsigned FalseReg,
1820  int &CondCycles,
1821  int &TrueCycles, int &FalseCycles) const {
1822  switch (Cond[0].getImm()) {
1823  case VCCNZ:
1824  case VCCZ: {
1825  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1826  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1827  assert(MRI.getRegClass(FalseReg) == RC);
1828 
1829  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1830  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1831 
1832  // Limit to equal cost for branch vs. N v_cndmask_b32s.
1833  return !RI.isSGPRClass(RC) && NumInsts <= 6;
1834  }
1835  case SCC_TRUE:
1836  case SCC_FALSE: {
1837  // FIXME: We could insert for VGPRs if we could replace the original compare
1838  // with a vector one.
1839  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1840  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1841  assert(MRI.getRegClass(FalseReg) == RC);
1842 
1843  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1844 
1845  // Multiples of 8 can do s_cselect_b64
1846  if (NumInsts % 2 == 0)
1847  NumInsts /= 2;
1848 
1849  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1850  return RI.isSGPRClass(RC);
1851  }
1852  default:
1853  return false;
1854  }
1855 }
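// Rough cost model implied above: a 64-bit VGPR select under VCCZ/VCCNZ
// needs two v_cndmask_b32 instructions (one per 32-bit half), while a
// 64-bit SGPR select under SCC can be a single s_cselect_b64.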
1856 
1857 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1858  MachineBasicBlock::iterator I, const DebugLoc &DL,
1859  unsigned DstReg, ArrayRef<MachineOperand> Cond,
1860  unsigned TrueReg, unsigned FalseReg) const {
1861  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1862  if (Pred == VCCZ || Pred == SCC_FALSE) {
1863  Pred = static_cast<BranchPredicate>(-Pred);
1864  std::swap(TrueReg, FalseReg);
1865  }
1866 
1867  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1868  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1869  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1870 
1871  if (DstSize == 32) {
1872  unsigned SelOp = Pred == SCC_TRUE ?
1873  AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1874 
1875  // Instruction's operands are backwards from what is expected.
1876  MachineInstr *Select =
1877  BuildMI(MBB, I, DL, get(SelOp), DstReg)
1878  .addReg(FalseReg)
1879  .addReg(TrueReg);
1880 
1881  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1882  return;
1883  }
1884 
1885  if (DstSize == 64 && Pred == SCC_TRUE) {
1886  MachineInstr *Select =
1887  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1888  .addReg(FalseReg)
1889  .addReg(TrueReg);
1890 
1891  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1892  return;
1893  }
1894 
1895  static const int16_t Sub0_15[] = {
1896  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1897  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1898  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1899  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1900  };
1901 
1902  static const int16_t Sub0_15_64[] = {
1903  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1904  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1905  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1906  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1907  };
1908 
1909  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1910  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1911  const int16_t *SubIndices = Sub0_15;
1912  int NElts = DstSize / 32;
1913 
1914  // 64-bit select is only available for SALU.
1915  if (Pred == SCC_TRUE) {
1916  SelOp = AMDGPU::S_CSELECT_B64;
1917  EltRC = &AMDGPU::SGPR_64RegClass;
1918  SubIndices = Sub0_15_64;
1919 
1920  assert(NElts % 2 == 0);
1921  NElts /= 2;
1922  }
1923 
1924  MachineInstrBuilder MIB = BuildMI(
1925  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1926 
1927  I = MIB->getIterator();
1928 
1929  SmallVector<unsigned, 8> Regs;
1930  for (int Idx = 0; Idx != NElts; ++Idx) {
1931  unsigned DstElt = MRI.createVirtualRegister(EltRC);
1932  Regs.push_back(DstElt);
1933 
1934  unsigned SubIdx = SubIndices[Idx];
1935 
1936  MachineInstr *Select =
1937  BuildMI(MBB, I, DL, get(SelOp), DstElt)
1938  .addReg(FalseReg, 0, SubIdx)
1939  .addReg(TrueReg, 0, SubIdx);
1940  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1941 
1942  MIB.addReg(DstElt)
1943  .addImm(SubIdx);
1944  }
1945 }
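// Example expansion (approximate MIR, names illustrative) for a 64-bit VGPR
// select on VCC:
//   %lo = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   %hi = V_CNDMASK_B32_e32 %false.sub1, %true.sub1, implicit $vcc
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1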
1946 
1947 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1948  switch (MI.getOpcode()) {
1949  case AMDGPU::V_MOV_B32_e32:
1950  case AMDGPU::V_MOV_B32_e64:
1951  case AMDGPU::V_MOV_B64_PSEUDO: {
1952  // If there are additional implicit register operands, this may be used for
1953  // register indexing so the source register operand isn't simply copied.
1954  unsigned NumOps = MI.getDesc().getNumOperands() +
1955  MI.getDesc().getNumImplicitUses();
1956 
1957  return MI.getNumOperands() == NumOps;
1958  }
1959  case AMDGPU::S_MOV_B32:
1960  case AMDGPU::S_MOV_B64:
1961  case AMDGPU::COPY:
1962  return true;
1963  default:
1964  return false;
1965  }
1966 }
1967 
1968 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
1969  unsigned Kind) const {
1970  switch(Kind) {
1981  }
1982  return AMDGPUAS::FLAT_ADDRESS;
1983 }
1984 
1984 
1985 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
1986  unsigned Opc = MI.getOpcode();
1987  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1988  AMDGPU::OpName::src0_modifiers);
1989  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1990  AMDGPU::OpName::src1_modifiers);
1991  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1992  AMDGPU::OpName::src2_modifiers);
1993 
1994  MI.RemoveOperand(Src2ModIdx);
1995  MI.RemoveOperand(Src1ModIdx);
1996  MI.RemoveOperand(Src0ModIdx);
1997 }
1998 
1999 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
2000  unsigned Reg, MachineRegisterInfo *MRI) const {
2001  if (!MRI->hasOneNonDBGUse(Reg))
2002  return false;
2003 
2004  switch (DefMI.getOpcode()) {
2005  default:
2006  return false;
2007  case AMDGPU::S_MOV_B64:
2008  // TODO: We could fold 64-bit immediates, but this gets complicated
2009  // when there are sub-registers.
2010  return false;
2011 
2012  case AMDGPU::V_MOV_B32_e32:
2013  case AMDGPU::S_MOV_B32:
2014  break;
2015  }
2016 
2017  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2018  assert(ImmOp);
2019  // FIXME: We could handle FrameIndex values here.
2020  if (!ImmOp->isImm())
2021  return false;
2022 
2023  unsigned Opc = UseMI.getOpcode();
2024  if (Opc == AMDGPU::COPY) {
2025  bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
2026  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2027  UseMI.setDesc(get(NewOpc));
2028  UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
2029  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2030  return true;
2031  }
2032 
2033  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2034  Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2035  // Don't fold if we are using source or output modifiers. The new VOP2
2036  // instructions don't have them.
2037  if (hasAnyModifiersSet(UseMI))
2038  return false;
2039 
2040  // If this is a free constant, there's no reason to do this.
2041  // TODO: We could fold this here instead of letting SIFoldOperands do it
2042  // later.
2043  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2044 
2045  // Any src operand can be used for the legality check.
2046  if (isInlineConstant(UseMI, *Src0, *ImmOp))
2047  return false;
2048 
2049  bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2050  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2051  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2052 
2053  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2054  // We should only expect these to be on src0 due to canonicalizations.
2055  if (Src0->isReg() && Src0->getReg() == Reg) {
2056  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2057  return false;
2058 
2059  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2060  return false;
2061 
2062  // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2063 
2064  const int64_t Imm = ImmOp->getImm();
2065 
2066  // FIXME: This would be a lot easier if we could return a new instruction
2067  // instead of having to modify in place.
2068 
2069  // Remove these first since they are at the end.
2070  UseMI.RemoveOperand(
2071  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2072  UseMI.RemoveOperand(
2073  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2074 
2075  unsigned Src1Reg = Src1->getReg();
2076  unsigned Src1SubReg = Src1->getSubReg();
2077  Src0->setReg(Src1Reg);
2078  Src0->setSubReg(Src1SubReg);
2079  Src0->setIsKill(Src1->isKill());
2080 
2081  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2082  Opc == AMDGPU::V_MAC_F16_e64)
2083  UseMI.untieRegOperand(
2084  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2085 
2086  Src1->ChangeToImmediate(Imm);
2087 
2088  removeModOperands(UseMI);
2089  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2090 
2091  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2092  if (DeleteDef)
2093  DefMI.eraseFromParent();
2094 
2095  return true;
2096  }
2097 
2098  // Added part is the constant: Use v_madak_{f16, f32}.
2099  if (Src2->isReg() && Src2->getReg() == Reg) {
2100  // Not allowed to use constant bus for another operand.
2101  // We can however allow an inline immediate as src0.
2102  bool Src0Inlined = false;
2103  if (Src0->isReg()) {
2104  // Try to inline the constant if possible.
2105  // If the def is a move-immediate and this is its only use,
2106  // we save a VGPR here.
2107  MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2108  if (Def && Def->isMoveImmediate() &&
2109  isInlineConstant(Def->getOperand(1)) &&
2110  MRI->hasOneUse(Src0->getReg())) {
2111  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2112  Src0Inlined = true;
2113  } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
2114  RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
2115  (RI.isVirtualRegister(Src0->getReg()) &&
2116  RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2117  return false;
2118  // VGPR is okay as Src0 - fallthrough
2119  }
2120 
2121  if (Src1->isReg() && !Src0Inlined ) {
2122  // We have one slot for inlinable constant so far - try to fill it
2123  MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2124  if (Def && Def->isMoveImmediate() &&
2125  isInlineConstant(Def->getOperand(1)) &&
2126  MRI->hasOneUse(Src1->getReg()) &&
2127  commuteInstruction(UseMI)) {
2128  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2129  } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
2130  RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2131  (RI.isVirtualRegister(Src1->getReg()) &&
2132  RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2133  return false;
2134  // VGPR is okay as Src1 - fallthrough
2135  }
2136 
2137  const int64_t Imm = ImmOp->getImm();
2138 
2139  // FIXME: This would be a lot easier if we could return a new instruction
2140  // instead of having to modify in place.
2141 
2142  // Remove these first since they are at the end.
2143  UseMI.RemoveOperand(
2144  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2145  UseMI.RemoveOperand(
2146  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2147 
2148  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2149  Opc == AMDGPU::V_MAC_F16_e64)
2150  UseMI.untieRegOperand(
2151  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2152 
2153  // ChangingToImmediate adds Src2 back to the instruction.
2154  Src2->ChangeToImmediate(Imm);
2155 
2156  // These come before src2.
2157  removeModOperands(UseMI);
2158  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2159 
2160  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2161  if (DeleteDef)
2162  DefMI.eraseFromParent();
2163 
2164  return true;
2165  }
2166  }
2167 
2168  return false;
2169 }
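// Example of the folds above (register names illustrative): with a
// single-use "v_mov_b32 %k, 0x41200000" feeding the addend (src2) of a
// v_mac_f32, the use becomes "v_madak_f32 %dst, %a, %b, 0x41200000" and the
// mov is erased; the same constant feeding the multiplied operand (src0)
// becomes v_madmk_f32 instead.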
2170 
2171 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2172  int WidthB, int OffsetB) {
2173  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2174  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2175  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2176  return LowOffset + LowWidth <= HighOffset;
2177 }
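// E.g. a 4-byte access at offset 0 and a 4-byte access at offset 4 do not
// overlap (0 + 4 <= 4), whereas 4-byte accesses at offsets 0 and 2 do.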
2178 
2179 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2180  MachineInstr &MIb) const {
2181  MachineOperand *BaseOp0, *BaseOp1;
2182  int64_t Offset0, Offset1;
2183 
2184  if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
2185  getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
2186  if (!BaseOp0->isIdenticalTo(*BaseOp1))
2187  return false;
2188 
2189  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2190  // FIXME: Handle ds_read2 / ds_write2.
2191  return false;
2192  }
2193  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2194  unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2195  if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2196  return true;
2197  }
2198  }
2199 
2200  return false;
2201 }
2202 
2203 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2204  MachineInstr &MIb,
2205  AliasAnalysis *AA) const {
2206  assert((MIa.mayLoad() || MIa.mayStore()) &&
2207  "MIa must load from or modify a memory location");
2208  assert((MIb.mayLoad() || MIb.mayStore()) &&
2209  "MIb must load from or modify a memory location");
2210 
2211  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2212  return false;
2213 
2214  // XXX - Can we relax this between address spaces?
2215  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2216  return false;
2217 
2218  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2219  const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2220  const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2221  if (MMOa->getValue() && MMOb->getValue()) {
2222  MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2223  MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2224  if (!AA->alias(LocA, LocB))
2225  return true;
2226  }
2227  }
2228 
2229  // TODO: Should we check the address space from the MachineMemOperand? That
2230  // would allow us to distinguish objects we know don't alias based on the
2231  // underlying address space, even if it was lowered to a different one,
2232  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2233  // buffer.
2234  if (isDS(MIa)) {
2235  if (isDS(MIb))
2236  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2237 
2238  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2239  }
2240 
2241  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2242  if (isMUBUF(MIb) || isMTBUF(MIb))
2243  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2244 
2245  return !isFLAT(MIb) && !isSMRD(MIb);
2246  }
2247 
2248  if (isSMRD(MIa)) {
2249  if (isSMRD(MIb))
2250  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2251 
2252  return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2253  }
2254 
2255  if (isFLAT(MIa)) {
2256  if (isFLAT(MIb))
2257  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2258 
2259  return false;
2260  }
2261 
2262  return false;
2263 }
2264 
2265 static int64_t getFoldableImm(const MachineOperand* MO) {
2266  if (!MO->isReg())
2267  return false;
2268  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2269  const MachineRegisterInfo &MRI = MF->getRegInfo();
2270  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2271  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2272  Def->getOperand(1).isImm())
2273  return Def->getOperand(1).getImm();
2274  return AMDGPU::NoRegister;
2275 }
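// Note: a result of 0 (== AMDGPU::NoRegister) means "no foldable immediate";
// callers test the return value in a boolean context, so a genuine immediate
// of 0 is treated the same as failure here.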
2276 
2277 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2278  MachineInstr &MI,
2279  LiveVariables *LV) const {
2280  unsigned Opc = MI.getOpcode();
2281  bool IsF16 = false;
2282  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2283 
2284  switch (Opc) {
2285  default:
2286  return nullptr;
2287  case AMDGPU::V_MAC_F16_e64:
2288  IsF16 = true;
2289  LLVM_FALLTHROUGH;
2290  case AMDGPU::V_MAC_F32_e64:
2291  case AMDGPU::V_FMAC_F32_e64:
2292  break;
2293  case AMDGPU::V_MAC_F16_e32:
2294  IsF16 = true;
2295  LLVM_FALLTHROUGH;
2296  case AMDGPU::V_MAC_F32_e32:
2297  case AMDGPU::V_FMAC_F32_e32: {
2298  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2299  AMDGPU::OpName::src0);
2300  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2301  if (!Src0->isReg() && !Src0->isImm())
2302  return nullptr;
2303 
2304  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2305  return nullptr;
2306 
2307  break;
2308  }
2309  }
2310 
2311  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2312  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2313  const MachineOperand *Src0Mods =
2314  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2315  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2316  const MachineOperand *Src1Mods =
2317  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2318  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2319  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2320  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2321 
2322  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2323  // If we have an SGPR input, we will violate the constant bus restriction.
2324  (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2325  if (auto Imm = getFoldableImm(Src2)) {
2326  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2327  get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2328  .add(*Dst)
2329  .add(*Src0)
2330  .add(*Src1)
2331  .addImm(Imm);
2332  }
2333  if (auto Imm = getFoldableImm(Src1)) {
2334  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2335  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2336  .add(*Dst)
2337  .add(*Src0)
2338  .addImm(Imm)
2339  .add(*Src2);
2340  }
2341  if (auto Imm = getFoldableImm(Src0)) {
2342  if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2343  AMDGPU::OpName::src0), Src1))
2344  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2345  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2346  .add(*Dst)
2347  .add(*Src1)
2348  .addImm(Imm)
2349  .add(*Src2);
2350  }
2351  }
2352 
2353  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2354  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2355  (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2356  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2357  .add(*Dst)
2358  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2359  .add(*Src0)
2360  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2361  .add(*Src1)
2362  .addImm(0) // Src mods
2363  .add(*Src2)
2364  .addImm(Clamp ? Clamp->getImm() : 0)
2365  .addImm(Omod ? Omod->getImm() : 0);
2366 }
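// Example: the two-address "v_mac_f32 %dst, %a, %b" (accumulator tied to
// src2) is rewritten to the three-address v_mad_f32 form; when one source is
// defined by a foldable v_mov_b32 literal, v_madmk_f32 or v_madak_f32 is
// emitted instead so the literal is encoded directly.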
2367 
2368 // It's not generally safe to move VALU instructions across these since it will
2369 // start using the register as a base index rather than directly.
2370 // XXX - Why isn't hasSideEffects sufficient for these?
2371 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2372  switch (MI.getOpcode()) {
2373  case AMDGPU::S_SET_GPR_IDX_ON:
2374  case AMDGPU::S_SET_GPR_IDX_MODE:
2375  case AMDGPU::S_SET_GPR_IDX_OFF:
2376  return true;
2377  default:
2378  return false;
2379  }
2380 }
2381 
2382 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2383  const MachineBasicBlock *MBB,
2384  const MachineFunction &MF) const {
2385  // XXX - Do we want the SP check in the base implementation?
2386 
2387  // Target-independent instructions do not have an implicit-use of EXEC, even
2388  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2389  // boundaries prevents incorrect movements of such instructions.
2390  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2391  MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2392  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2393  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2394  changesVGPRIndexingMode(MI);
2395 }
2396 
2397 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
2398  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
2399  Opcode == AMDGPU::DS_GWS_INIT ||
2400  Opcode == AMDGPU::DS_GWS_SEMA_V ||
2401  Opcode == AMDGPU::DS_GWS_SEMA_BR ||
2402  Opcode == AMDGPU::DS_GWS_SEMA_P ||
2403  Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
2404  Opcode == AMDGPU::DS_GWS_BARRIER;
2405 }
2406 
2407 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
2408  unsigned Opcode = MI.getOpcode();
2409 
2410  if (MI.mayStore() && isSMRD(MI))
2411  return true; // scalar store or atomic
2412 
2413  // These instructions cause shader I/O that may cause hardware lockups
2414  // when executed with an empty EXEC mask.
2415  //
2416  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2417  // EXEC = 0, but checking for that case here seems not worth it
2418  // given the typical code patterns.
2419  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2420  Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
2421  Opcode == AMDGPU::DS_ORDERED_COUNT)
2422  return true;
2423 
2424  if (MI.isInlineAsm())
2425  return true; // conservative assumption
2426 
2427  // These are like SALU instructions in terms of effects, so it's questionable
2428  // whether we should return true for those.
2429  //
2430  // However, executing them with EXEC = 0 causes them to operate on undefined
2431  // data, which we avoid by returning true here.
2432  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2433  return true;
2434 
2435  return false;
2436 }
2437 
2438 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2439  switch (Imm.getBitWidth()) {
2440  case 32:
2441  return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2442  ST.hasInv2PiInlineImm());
2443  case 64:
2444  return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2445  ST.hasInv2PiInlineImm());
2446  case 16:
2447  return ST.has16BitInsts() &&
2448  AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2449  ST.hasInv2PiInlineImm());
2450  default:
2451  llvm_unreachable("invalid bitwidth");
2452  }
2453 }
2454 
2455 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2456  uint8_t OperandType) const {
2457  if (!MO.isImm() ||
2458  OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2459  OperandType > AMDGPU::OPERAND_SRC_LAST)
2460  return false;
2461 
2462  // MachineOperand provides no way to tell the true operand size, since it only
2463  // records a 64-bit value. We need to know the size to determine if a 32-bit
2464  // floating point immediate bit pattern is legal for an integer immediate. It
2465  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2466 
2467  int64_t Imm = MO.getImm();
2468  switch (OperandType) {
2473  int32_t Trunc = static_cast<int32_t>(Imm);
2475  }
2481  ST.hasInv2PiInlineImm());
2486  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2487  // A few special case instructions have 16-bit operands on subtargets
2488  // where 16-bit instructions are not legal.
2489  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2490  // constants in these cases
2491  int16_t Trunc = static_cast<int16_t>(Imm);
2492  return ST.has16BitInsts() &&
2494  }
2495 
2496  return false;
2497  }
2500  if (isUInt<16>(Imm)) {
2501  int16_t Trunc = static_cast<int16_t>(Imm);
2502  return ST.has16BitInsts() &&
2504  }
2505  if (!(Imm & 0xffff)) {
2506  return ST.has16BitInsts() &&
2508  }
2509  uint32_t Trunc = static_cast<uint32_t>(Imm);
2511  }
2512  default:
2513  llvm_unreachable("invalid bitwidth");
2514  }
2515 }
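// Example: for a 32-bit FP operand, 1.0 (0x3f800000) is an inline constant,
// but 10.0 (0x41200000) is not and must be encoded as a 32-bit literal.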
2516 
2517 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2518  const MCOperandInfo &OpInfo) const {
2519  switch (MO.getType()) {
2520  case MachineOperand::MO_Register:
2521  return false;
2522  case MachineOperand::MO_Immediate:
2523  return !isInlineConstant(MO, OpInfo);
2524  case MachineOperand::MO_FrameIndex:
2525  case MachineOperand::MO_MachineBasicBlock:
2526  case MachineOperand::MO_ExternalSymbol:
2527  case MachineOperand::MO_GlobalAddress:
2528  case MachineOperand::MO_MCSymbol:
2529  return true;
2530  default:
2531  llvm_unreachable("unexpected operand type");
2532  }
2533 }
2534 
2535 static bool compareMachineOp(const MachineOperand &Op0,
2536  const MachineOperand &Op1) {
2537  if (Op0.getType() != Op1.getType())
2538  return false;
2539 
2540  switch (Op0.getType()) {
2542  return Op0.getReg() == Op1.getReg();
2544  return Op0.getImm() == Op1.getImm();
2545  default:
2546  llvm_unreachable("Didn't expect to be comparing these operand types");
2547  }
2548 }
2549 
2550 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2551  const MachineOperand &MO) const {
2552  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2553 
2554  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2555 
2556  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2557  return true;
2558 
2559  if (OpInfo.RegClass < 0)
2560  return false;
2561 
2562  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2563  return RI.opCanUseInlineConstant(OpInfo.OperandType);
2564 
2565  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2566 }
2567 
2568 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2569  int Op32 = AMDGPU::getVOPe32(Opcode);
2570  if (Op32 == -1)
2571  return false;
2572 
2573  return pseudoToMCOpcode(Op32) != -1;
2574 }
2575 
2576 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2577  // The src0_modifier operand is present on all instructions
2578  // that have modifiers.
2579 
2580  return AMDGPU::getNamedOperandIdx(Opcode,
2581  AMDGPU::OpName::src0_modifiers) != -1;
2582 }
2583 
2584 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2585  unsigned OpName) const {
2586  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2587  return Mods && Mods->getImm();
2588 }
2589 
2590 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2591  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2592  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2593  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2594  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2595  hasModifiersSet(MI, AMDGPU::OpName::omod);
2596 }
2597 
2598 bool SIInstrInfo::canShrink(const MachineInstr &MI,
2599  const MachineRegisterInfo &MRI) const {
2600  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2601  // Can't shrink instruction with three operands.
2602  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
2603  // a special case for it. It can only be shrunk if the third operand
2604  // is vcc. We should handle this the same way we handle vopc, by adding
2605  // a register allocation hint pre-regalloc and then doing the shrinking
2606  // post-regalloc.
2607  if (Src2) {
2608  switch (MI.getOpcode()) {
2609  default: return false;
2610 
2611  case AMDGPU::V_ADDC_U32_e64:
2612  case AMDGPU::V_SUBB_U32_e64:
2613  case AMDGPU::V_SUBBREV_U32_e64: {
2614  const MachineOperand *Src1
2615  = getNamedOperand(MI, AMDGPU::OpName::src1);
2616  if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
2617  return false;
2618  // Additional verification is needed for sdst/src2.
2619  return true;
2620  }
2621  case AMDGPU::V_MAC_F32_e64:
2622  case AMDGPU::V_MAC_F16_e64:
2623  case AMDGPU::V_FMAC_F32_e64:
2624  if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
2625  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
2626  return false;
2627  break;
2628 
2629  case AMDGPU::V_CNDMASK_B32_e64:
2630  break;
2631  }
2632  }
2633 
2634  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2635  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
2636  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
2637  return false;
2638 
2639  // We don't need to check src0, all input types are legal, so just make sure
2640  // src0 isn't using any modifiers.
2641  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
2642  return false;
2643 
2644  // Can it be shrunk to a valid 32 bit opcode?
2645  if (!hasVALU32BitEncoding(MI.getOpcode()))
2646  return false;
2647 
2648  // Check output modifiers
2649  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
2650  !hasModifiersSet(MI, AMDGPU::OpName::clamp);
2651 }
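// Example: "v_add_f32_e64 %d, %a, %b" with no modifiers and a VGPR %b can be
// shrunk to v_add_f32_e32; the same instruction with clamp/omod set or with
// an SGPR in src1 must stay in the 64-bit encoding.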
2652 
2653 // Set VCC operand with all flags from \p Orig, except for setting it as
2654 // implicit.
2655 static void copyFlagsToImplicitVCC(MachineInstr &MI,
2656  const MachineOperand &Orig) {
2657 
2658  for (MachineOperand &Use : MI.implicit_operands()) {
2659  if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
2660  Use.setIsUndef(Orig.isUndef());
2661  Use.setIsKill(Orig.isKill());
2662  return;
2663  }
2664  }
2665 }
2666 
2667 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
2668  unsigned Op32) const {
2669  MachineBasicBlock *MBB = MI.getParent();
2670  MachineInstrBuilder Inst32 =
2671  BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
2672 
2673  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
2674  // For VOPC instructions, this is replaced by an implicit def of vcc.
2675  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
2676  if (Op32DstIdx != -1) {
2677  // dst
2678  Inst32.add(MI.getOperand(0));
2679  } else {
2680  assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
2681  "Unexpected case");
2682  }
2683 
2684  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
2685 
2686  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2687  if (Src1)
2688  Inst32.add(*Src1);
2689 
2690  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2691 
2692  if (Src2) {
2693  int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
2694  if (Op32Src2Idx != -1) {
2695  Inst32.add(*Src2);
2696  } else {
2697  // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
2698  // replaced with an implicit read of vcc. This was already added
2699  // during the initial BuildMI, so find it to preserve the flags.
2700  copyFlagsToImplicitVCC(*Inst32, *Src2);
2701  }
2702  }
2703 
2704  return Inst32;
2705 }
2706 
2707 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2708  const MachineOperand &MO,
2709  const MCOperandInfo &OpInfo) const {
2710  // Literal constants use the constant bus.
2711  //if (isLiteralConstantLike(MO, OpInfo))
2712  // return true;
2713  if (MO.isImm())
2714  return !isInlineConstant(MO, OpInfo);
2715 
2716  if (!MO.isReg())
2717  return true; // Misc other operands like FrameIndex
2718 
2719  if (!MO.isUse())
2720  return false;
2721 
2722  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2723  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2724 
2725  // FLAT_SCR is just an SGPR pair.
2726  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2727  return true;
2728 
2729  // EXEC register uses the constant bus.
2730  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2731  return true;
2732 
2733  // SGPRs use the constant bus
2734  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2735  (!MO.isImplicit() &&
2736  (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2737  AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2738 }
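// Example: in "v_add_f32 %d, %sgpr, %vgpr" the SGPR occupies the single
// constant bus slot; a second SGPR or a non-inline literal source would be a
// second constant bus read, which the verifier below rejects for VOP*
// instructions.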
2739 
2740 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2741  for (const MachineOperand &MO : MI.implicit_operands()) {
2742  // We only care about reads.
2743  if (MO.isDef())
2744  continue;
2745 
2746  switch (MO.getReg()) {
2747  case AMDGPU::VCC:
2748  case AMDGPU::M0:
2749  case AMDGPU::FLAT_SCR:
2750  return MO.getReg();
2751 
2752  default:
2753  break;
2754  }
2755  }
2756 
2757  return AMDGPU::NoRegister;
2758 }
2759 
2760 static bool shouldReadExec(const MachineInstr &MI) {
2761  if (SIInstrInfo::isVALU(MI)) {
2762  switch (MI.getOpcode()) {
2763  case AMDGPU::V_READLANE_B32:
2764  case AMDGPU::V_READLANE_B32_si:
2765  case AMDGPU::V_READLANE_B32_vi:
2766  case AMDGPU::V_WRITELANE_B32:
2767  case AMDGPU::V_WRITELANE_B32_si:
2768  case AMDGPU::V_WRITELANE_B32_vi:
2769  return false;
2770  }
2771 
2772  return true;
2773  }
2774 
2775  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2776  SIInstrInfo::isSALU(MI) ||
2777  SIInstrInfo::isSMRD(MI))
2778  return false;
2779 
2780  return true;
2781 }
2782 
2783 static bool isSubRegOf(const SIRegisterInfo &TRI,
2784  const MachineOperand &SuperVec,
2785  const MachineOperand &SubReg) {
2786  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2787  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2788 
2789  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2790  SubReg.getReg() == SuperVec.getReg();
2791 }
2792 
2793 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2794  StringRef &ErrInfo) const {
2795  uint16_t Opcode = MI.getOpcode();
2796  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2797  return true;
2798 
2799  const MachineFunction *MF = MI.getParent()->getParent();
2800  const MachineRegisterInfo &MRI = MF->getRegInfo();
2801 
2802  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2803  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2804  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2805 
2806  // Make sure the number of operands is correct.
2807  const MCInstrDesc &Desc = get(Opcode);
2808  if (!Desc.isVariadic() &&
2809  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2810  ErrInfo = "Instruction has wrong number of operands.";
2811  return false;
2812  }
2813 
2814  if (MI.isInlineAsm()) {
2815  // Verify register classes for inlineasm constraints.
2816  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2817  I != E; ++I) {
2818  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2819  if (!RC)
2820  continue;
2821 
2822  const MachineOperand &Op = MI.getOperand(I);
2823  if (!Op.isReg())
2824  continue;
2825 
2826  unsigned Reg = Op.getReg();
2827  if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2828  ErrInfo = "inlineasm operand has incorrect register class.";
2829  return false;
2830  }
2831  }
2832 
2833  return true;
2834  }
2835 
2836  // Make sure the register classes are correct.
2837  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2838  if (MI.getOperand(i).isFPImm()) {
2839  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2840  "all fp values to integers.";
2841  return false;
2842  }
2843 
2844  int RegClass = Desc.OpInfo[i].RegClass;
2845 
2846  switch (Desc.OpInfo[i].OperandType) {
2848  if (MI.getOperand(i).isImm()) {
2849  ErrInfo = "Illegal immediate value for operand.";
2850  return false;
2851  }
2852  break;
2855  break;
2862  const MachineOperand &MO = MI.getOperand(i);
2863  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2864  ErrInfo = "Illegal immediate value for operand.";
2865  return false;
2866  }
2867  break;
2868  }
2871  // Check if this operand is an immediate.
2872  // FrameIndex operands will be replaced by immediates, so they are
2873  // allowed.
2874  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2875  ErrInfo = "Expected immediate, but got non-immediate";
2876  return false;
2877  }
2878  LLVM_FALLTHROUGH;
2879  default:
2880  continue;
2881  }
2882 
2883  if (!MI.getOperand(i).isReg())
2884  continue;
2885 
2886  if (RegClass != -1) {
2887  unsigned Reg = MI.getOperand(i).getReg();
2888  if (Reg == AMDGPU::NoRegister ||
2889  TargetRegisterInfo::isVirtualRegister(Reg))
2890  continue;
2891 
2892  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2893  if (!RC->contains(Reg)) {
2894  ErrInfo = "Operand has incorrect register class.";
2895  return false;
2896  }
2897  }
2898  }
2899 
2900  // Verify SDWA
2901  if (isSDWA(MI)) {
2902  if (!ST.hasSDWA()) {
2903  ErrInfo = "SDWA is not supported on this target";
2904  return false;
2905  }
2906 
2907  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2908 
2909  const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2910 
2911  for (int OpIdx: OpIndices) {
2912  if (OpIdx == -1)
2913  continue;
2914  const MachineOperand &MO = MI.getOperand(OpIdx);
2915 
2916  if (!ST.hasSDWAScalar()) {
2917  // Only VGPRS on VI
2918  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2919  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2920  return false;
2921  }
2922  } else {
2923  // No immediates on GFX9
2924  if (!MO.isReg()) {
2925  ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2926  return false;
2927  }
2928  }
2929  }
2930 
2931  if (!ST.hasSDWAOmod()) {
2932  // No omod allowed on VI
2933  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2934  if (OMod != nullptr &&
2935  (!OMod->isImm() || OMod->getImm() != 0)) {
2936  ErrInfo = "OMod not allowed in SDWA instructions on VI";
2937  return false;
2938  }
2939  }
2940 
2941  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2942  if (isVOPC(BasicOpcode)) {
2943  if (!ST.hasSDWASdst() && DstIdx != -1) {
2944  // Only vcc allowed as dst on VI for VOPC
2945  const MachineOperand &Dst = MI.getOperand(DstIdx);
2946  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2947  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2948  return false;
2949  }
2950  } else if (!ST.hasSDWAOutModsVOPC()) {
2951  // No clamp allowed on GFX9 for VOPC
2952  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2953  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2954  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2955  return false;
2956  }
2957 
2958  // No omod allowed on GFX9 for VOPC
2959  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2960  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2961  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2962  return false;
2963  }
2964  }
2965  }
2966 
2967  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
2968  if (DstUnused && DstUnused->isImm() &&
2969  DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
2970  const MachineOperand &Dst = MI.getOperand(DstIdx);
2971  if (!Dst.isReg() || !Dst.isTied()) {
2972  ErrInfo = "Dst register should have tied register";
2973  return false;
2974  }
2975 
2976  const MachineOperand &TiedMO =
2977  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
2978  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
2979  ErrInfo =
2980  "Dst register should be tied to implicit use of preserved register";
2981  return false;
2982  } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
2983  Dst.getReg() != TiedMO.getReg()) {
2984  ErrInfo = "Dst register should use same physical register as preserved";
2985  return false;
2986  }
2987  }
2988  }
2989 
2990  // Verify MIMG
2991  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
2992  // Ensure that the return type used is large enough for all the options
2993  // being used TFE/LWE require an extra result register.
2994  const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
2995  if (DMask) {
2996  uint64_t DMaskImm = DMask->getImm();
2997  uint32_t RegCount =
2998  isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
2999  const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
3000  const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
3001  const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
3002 
3003  // Adjust for packed 16 bit values
3004  if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3005  RegCount >>= 1;
3006 
3007  // Adjust if using LWE or TFE
3008  if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3009  RegCount += 1;
3010 
3011  const uint32_t DstIdx =
3012  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3013  const MachineOperand &Dst = MI.getOperand(DstIdx);
3014  if (Dst.isReg()) {
3015  const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3016  uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3017  if (RegCount > DstSize) {
3018  ErrInfo = "MIMG instruction returns too many registers for dst "
3019  "register class";
3020  return false;
3021  }
3022  }
3023  }
3024  }
3025 
3026  // Verify VOP*. Ignore multiple sgpr operands on writelane.
3027  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
3028  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
3029  // Only look at the true operands. Only a real operand can use the constant
3030  // bus, and we don't want to check pseudo-operands like the source modifier
3031  // flags.
3032  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
3033 
3034  unsigned ConstantBusCount = 0;
3035  unsigned LiteralCount = 0;
3036 
3037  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
3038  ++ConstantBusCount;
3039 
3040  unsigned SGPRUsed = findImplicitSGPRRead(MI);
3041  if (SGPRUsed != AMDGPU::NoRegister)
3042  ++ConstantBusCount;
3043 
3044  for (int OpIdx : OpIndices) {
3045  if (OpIdx == -1)
3046  break;
3047  const MachineOperand &MO = MI.getOperand(OpIdx);
3048  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3049  if (MO.isReg()) {
3050  if (MO.getReg() != SGPRUsed)
3051  ++ConstantBusCount;
3052  SGPRUsed = MO.getReg();
3053  } else {
3054  ++ConstantBusCount;
3055  ++LiteralCount;
3056  }
3057  }
3058  }
3059  if (ConstantBusCount > 1) {
3060  ErrInfo = "VOP* instruction uses the constant bus more than once";
3061  return false;
3062  }
3063 
3064  if (isVOP3(MI) && LiteralCount) {
3065  ErrInfo = "VOP3 instruction uses literal";
3066  return false;
3067  }
3068  }
3069 
3070  // Verify misc. restrictions on specific instructions.
3071  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
3072  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
3073  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3074  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
3075  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
3076  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
3077  if (!compareMachineOp(Src0, Src1) &&
3078  !compareMachineOp(Src0, Src2)) {
3079  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
3080  return false;
3081  }
3082  }
3083  }
3084 
3085  if (isSOPK(MI)) {
3086  int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
3087  if (sopkIsZext(MI)) {
3088  if (!isUInt<16>(Imm)) {
3089  ErrInfo = "invalid immediate for SOPK instruction";
3090  return false;
3091  }
3092  } else {
3093  if (!isInt<16>(Imm)) {
3094  ErrInfo = "invalid immediate for SOPK instruction";
3095  return false;
3096  }
3097  }
3098  }
3099 
3100  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
3101  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
3102  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3103  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
3104  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3105  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
3106 
3107  const unsigned StaticNumOps = Desc.getNumOperands() +
3108  Desc.getNumImplicitUses();
3109  const unsigned NumImplicitOps = IsDst ? 2 : 1;
3110 
3111  // Allow additional implicit operands. This allows a fixup done by the post
3112  // RA scheduler where the main implicit operand is killed and implicit-defs
3113  // are added for sub-registers that remain live after this instruction.
3114  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
3115  ErrInfo = "missing implicit register operands";
3116  return false;
3117  }
3118 
3119  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3120  if (IsDst) {
3121  if (!Dst->isUse()) {
3122  ErrInfo = "v_movreld_b32 vdst should be a use operand";
3123  return false;
3124  }
3125 
3126  unsigned UseOpIdx;
3127  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
3128  UseOpIdx != StaticNumOps + 1) {
3129  ErrInfo = "movrel implicit operands should be tied";
3130  return false;
3131  }
3132  }
3133 
3134  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3135  const MachineOperand &ImpUse
3136  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
3137  if (!ImpUse.isReg() || !ImpUse.isUse() ||
3138  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
3139  ErrInfo = "src0 should be subreg of implicit vector use";
3140  return false;
3141  }
3142  }
3143 
3144  // Make sure we aren't losing exec uses in the td files. This mostly requires
3145  // being careful when using let Uses to try to add other use registers.
3146  if (shouldReadExec(MI)) {
3147  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
3148  ErrInfo = "VALU instruction does not implicitly read exec mask";
3149  return false;
3150  }
3151  }
3152 
3153  if (isSMRD(MI)) {
3154  if (MI.mayStore()) {
3155  // The register offset form of scalar stores may only use m0 as the
3156  // soffset register.
3157  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
3158  if (Soff && Soff->getReg() != AMDGPU::M0) {
3159  ErrInfo = "scalar stores must use m0 as offset register";
3160  return false;
3161  }
3162  }
3163  }
3164 
3165  if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
3166  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3167  if (Offset->getImm() != 0) {
3168  ErrInfo = "subtarget does not support offsets in flat instructions";
3169  return false;
3170  }
3171  }
3172 
3173  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
3174  if (DppCt) {
3175  using namespace AMDGPU::DPP;
3176 
3177  unsigned DC = DppCt->getImm();
3178  if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
3179  DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
3184  ErrInfo = "Invalid dpp_ctrl value";
3185  return false;
3186  }
3187  }
3188 
3189  return true;
3190 }
3191 
3192 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
3193  switch (MI.getOpcode()) {
3194  default: return AMDGPU::INSTRUCTION_LIST_END;
3195  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
3196  case AMDGPU::COPY: return AMDGPU::COPY;
3197  case AMDGPU::PHI: return AMDGPU::PHI;
3198  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
3199  case AMDGPU::WQM: return AMDGPU::WQM;
3200  case AMDGPU::WWM: return AMDGPU::WWM;
3201  case AMDGPU::S_MOV_B32:
3202  return MI.getOperand(1).isReg() ?
3203  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
3204  case AMDGPU::S_ADD_I32:
3205  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
3206  case AMDGPU::S_ADDC_U32:
3207  return AMDGPU::V_ADDC_U32_e32;
3208  case AMDGPU::S_SUB_I32:
3209  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
3210  // FIXME: These are not consistently handled, and selected when the carry is
3211  // used.
3212  case AMDGPU::S_ADD_U32:
3213  return AMDGPU::V_ADD_I32_e32;
3214  case AMDGPU::S_SUB_U32:
3215  return AMDGPU::V_SUB_I32_e32;
3216  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3217  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
3218  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3219  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3220  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3221  case AMDGPU::S_XNOR_B32:
3222  return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
3223  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3224  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3225  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3226  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3227  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3228  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3229  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3230  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3231  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3232  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3233  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3234  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3235  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3236  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3237  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3238  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3239  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3240  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3241  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3242  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3243  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3244  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3245  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3246  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3247  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3248  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3249  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3250  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3251  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3252  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3253  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3254  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3255  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3256  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3257  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3258  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3259  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3260  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3261  }
3262 }
3263 
3264 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
3265  unsigned OpNo) const {
3266  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3267  const MCInstrDesc &Desc = get(MI.getOpcode());
3268  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
3269  Desc.OpInfo[OpNo].RegClass == -1) {
3270  unsigned Reg = MI.getOperand(OpNo).getReg();
3271 
3272  if (TargetRegisterInfo::isVirtualRegister(Reg))
3273  return MRI.getRegClass(Reg);
3274  return RI.getPhysRegClass(Reg);
3275  }
3276 
3277  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3278  return RI.getRegClass(RCID);
3279 }
3280 
3281 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3282  MachineBasicBlock::iterator I = MI;
3283  MachineBasicBlock *MBB = MI.getParent();
3284  MachineOperand &MO = MI.getOperand(OpIdx);
3285  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3286  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3287  const TargetRegisterClass *RC = RI.getRegClass(RCID);
3288  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3289  if (MO.isReg())
3290  Opcode = AMDGPU::COPY;
3291  else if (RI.isSGPRClass(RC))
3292  Opcode = AMDGPU::S_MOV_B32;
3293 
3294  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3295  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3296  VRC = &AMDGPU::VReg_64RegClass;
3297  else
3298  VRC = &AMDGPU::VGPR_32RegClass;
3299 
3300  unsigned Reg = MRI.createVirtualRegister(VRC);
3301  DebugLoc DL = MBB->findDebugLoc(I);
3302  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3303  MO.ChangeToRegister(Reg, false);
3304 }
3305 
3306 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3307  MachineRegisterInfo &MRI,
3308  MachineOperand &SuperReg,
3309  const TargetRegisterClass *SuperRC,
3310  unsigned SubIdx,
3311  const TargetRegisterClass *SubRC)
3312  const {
3313  MachineBasicBlock *MBB = MI->getParent();
3314  DebugLoc DL = MI->getDebugLoc();
3315  unsigned SubReg = MRI.createVirtualRegister(SubRC);
3316 
3317  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3318  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3319  .addReg(SuperReg.getReg(), 0, SubIdx);
3320  return SubReg;
3321  }
3322 
3323  // Just in case the super register is itself a sub-register, copy it to a new
3324  // value so we don't need to worry about merging its subreg index with the
3325  // SubIdx passed to this function. The register coalescer should be able to
3326  // eliminate this extra copy.
3327  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3328 
3329  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3330  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3331 
3332  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3333  .addReg(NewSuperReg, 0, SubIdx);
3334 
3335  return SubReg;
3336 }
3337 
3338 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3339  MachineBasicBlock::iterator MII,
3340  MachineRegisterInfo &MRI,
3341  MachineOperand &Op,
3342  const TargetRegisterClass *SuperRC,
3343  unsigned SubIdx,
3344  const TargetRegisterClass *SubRC) const {
3345  if (Op.isImm()) {
3346  if (SubIdx == AMDGPU::sub0)
3347  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3348  if (SubIdx == AMDGPU::sub1)
3349  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3350 
3351  llvm_unreachable("Unhandled register index for immediate");
3352  }
3353 
3354  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3355  SubIdx, SubRC);
3356  return MachineOperand::CreateReg(SubReg, false);
3357 }
3358 
3359 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3360 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3361  assert(Inst.getNumExplicitOperands() == 3);
3362  MachineOperand Op1 = Inst.getOperand(1);
3363  Inst.RemoveOperand(1);
3364  Inst.addOperand(Op1);
3365 }
3366 
3367 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3368  const MCOperandInfo &OpInfo,
3369  const MachineOperand &MO) const {
3370  if (!MO.isReg())
3371  return false;
3372 
3373  unsigned Reg = MO.getReg();
3374  const TargetRegisterClass *RC =
3375  TargetRegisterInfo::isVirtualRegister(Reg) ?
3376  MRI.getRegClass(Reg) :
3377  RI.getPhysRegClass(Reg);
3378 
3379  const SIRegisterInfo *TRI =
3380  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3381  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3382 
3383  // In order to be legal, the common sub-class must be equal to the
3384  // class of the current operand. For example:
3385  //
3386  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3387  // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3388  //
3389  // s_sendmsg 0, s0 ; Operand defined as m0reg
3390  // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3391 
3392  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3393 }
3394 
3395 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3396  const MCOperandInfo &OpInfo,
3397  const MachineOperand &MO) const {
3398  if (MO.isReg())
3399  return isLegalRegOperand(MRI, OpInfo, MO);
3400 
3401  // Handle non-register types that are treated like immediates.
3402  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3403  return true;
3404 }
3405 
3406 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3407  const MachineOperand *MO) const {
3408  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3409  const MCInstrDesc &InstDesc = MI.getDesc();
3410  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3411  const TargetRegisterClass *DefinedRC =
3412  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3413  if (!MO)
3414  MO = &MI.getOperand(OpIdx);
3415 
3416  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3417 
3418  RegSubRegPair SGPRUsed;
3419  if (MO->isReg())
3420  SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3421 
3422  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3423  if (i == OpIdx)
3424  continue;
3425  const MachineOperand &Op = MI.getOperand(i);
3426  if (Op.isReg()) {
3427  if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3428  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3429  return false;
3430  }
3431  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3432  return false;
3433  }
3434  }
3435  }
3436 
3437  if (MO->isReg()) {
3438  assert(DefinedRC);
3439  return isLegalRegOperand(MRI, OpInfo, *MO);
3440  }
3441 
3442  // Handle non-register types that are treated like immediates.
3443  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3444 
3445  if (!DefinedRC) {
3446  // This operand expects an immediate.
3447  return true;
3448  }
3449 
3450  return isImmOperandLegal(MI, OpIdx, *MO);
3451 }
3452 
3453 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3454  MachineInstr &MI) const {
3455  unsigned Opc = MI.getOpcode();
3456  const MCInstrDesc &InstrDesc = get(Opc);
3457 
3458  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3459  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3460 
3461  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3462  // we need to only have one constant bus use.
3463  //
3464  // Note we do not need to worry about literal constants here. They are
3465  // disabled for the operand type for instructions because they will always
3466  // violate the one constant bus use rule.
3467  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3468  if (HasImplicitSGPR) {
3469  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3470  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3471 
3472  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3473  legalizeOpWithMove(MI, Src0Idx);
3474  }
3475 
3476  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3477  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3478  // src0/src1 with V_READFIRSTLANE.
3479  if (Opc == AMDGPU::V_WRITELANE_B32) {
3480  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3481  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3482  const DebugLoc &DL = MI.getDebugLoc();
3483  if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3484  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3485  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3486  .add(Src0);
3487  Src0.ChangeToRegister(Reg, false);
3488  }
3489  if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3490  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3491  const DebugLoc &DL = MI.getDebugLoc();
3492  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3493  .add(Src1);
3494  Src1.ChangeToRegister(Reg, false);
3495  }
3496  return;
3497  }
3498 
3499  // VOP2 src0 instructions support all operand types, so we don't need to check
3500  // their legality. If src1 is already legal, we don't need to do anything.
3501  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3502  return;
3503 
3504  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3505  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3506  // select is uniform.
3507  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3508  RI.isVGPR(MRI, Src1.getReg())) {
3509  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3510  const DebugLoc &DL = MI.getDebugLoc();
3511  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3512  .add(Src1);
3513  Src1.ChangeToRegister(Reg, false);
3514  return;
3515  }
3516 
3517  // We do not use commuteInstruction here because it is too aggressive and will
3518  // commute if it is possible. We only want to commute here if it improves
3519  // legality. This can be called a fairly large number of times so don't waste
3520  // compile time pointlessly swapping and checking legality again.
3521  if (HasImplicitSGPR || !MI.isCommutable()) {
3522  legalizeOpWithMove(MI, Src1Idx);
3523  return;
3524  }
3525 
3526  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3527  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3528 
3529  // If src0 can be used as src1, commuting will make the operands legal.
3530  // Otherwise we have to give up and insert a move.
3531  //
3532  // TODO: Other immediate-like operand kinds could be commuted if there was a
3533  // MachineOperand::ChangeTo* for them.
3534  if ((!Src1.isImm() && !Src1.isReg()) ||
3535  !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3536  legalizeOpWithMove(MI, Src1Idx);
3537  return;
3538  }
3539 
3540  int CommutedOpc = commuteOpcode(MI);
3541  if (CommutedOpc == -1) {
3542  legalizeOpWithMove(MI, Src1Idx);
3543  return;
3544  }
3545 
3546  MI.setDesc(get(CommutedOpc));
3547 
3548  unsigned Src0Reg = Src0.getReg();
3549  unsigned Src0SubReg = Src0.getSubReg();
3550  bool Src0Kill = Src0.isKill();
3551 
3552  if (Src1.isImm())
3553  Src0.ChangeToImmediate(Src1.getImm());
3554  else if (Src1.isReg()) {
3555  Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3556  Src0.setSubReg(Src1.getSubReg());
3557  } else
3558  llvm_unreachable("Should only have register or immediate operands");
3559 
3560  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3561  Src1.setSubReg(Src0SubReg);
3562 }
3563 
3564 // Legalize VOP3 operands. Because all operand types are supported for any
3565 // operand, and since literal constants are not allowed and should never be
3566 // seen, we only need to worry about inserting copies if we use multiple SGPR
3567 // operands.
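 // Illustrative sketch (not from the original source; register names are made
 // up): a VOP3 such as
 //   %d = V_MAD_F32 %sgpr0, %sgpr1, %vgpr2
 // may keep one SGPR source (the constant bus allows a single SGPR read), but
 // the second SGPR use has to be copied into a VGPR by legalizeOpWithMove in
 // the loop below.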
3568 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3569  MachineInstr &MI) const {
3570  unsigned Opc = MI.getOpcode();
3571 
3572  int VOP3Idx[3] = {
3573  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3574  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3575  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3576  };
3577 
3578  // Find the one SGPR operand we are allowed to use.
3579  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3580 
3581  for (unsigned i = 0; i < 3; ++i) {
3582  int Idx = VOP3Idx[i];
3583  if (Idx == -1)
3584  break;
3585  MachineOperand &MO = MI.getOperand(Idx);
3586 
3587  // We should never see a VOP3 instruction with an illegal immediate operand.
3588  if (!MO.isReg())
3589  continue;
3590 
3591  if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3592  continue; // VGPRs are legal
3593 
3594  if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3595  SGPRReg = MO.getReg();
3596  // We can use one SGPR in each VOP3 instruction.
3597  continue;
3598  }
3599 
3600  // If we make it this far, then the operand is not legal and we must
3601  // legalize it.
3602  legalizeOpWithMove(MI, Idx);
3603  }
3604 }
3605 
3606 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3607  MachineRegisterInfo &MRI) const {
3608  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3609  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3610  unsigned DstReg = MRI.createVirtualRegister(SRC);
3611  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3612 
3613  if (SubRegs == 1) {
3614  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3615  get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3616  .addReg(SrcReg);
3617  return DstReg;
3618  }
3619 
3620  SmallVector<unsigned, 8> SRegs;
3621  for (unsigned i = 0; i < SubRegs; ++i) {
3622  unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3623  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3624  get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3625  .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3626  SRegs.push_back(SGPR);
3627  }
3628 
3629  MachineInstrBuilder MIB =
3630  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3631  get(AMDGPU::REG_SEQUENCE), DstReg);
3632  for (unsigned i = 0; i < SubRegs; ++i) {
3633  MIB.addReg(SRegs[i]);
3634  MIB.addImm(RI.getSubRegFromChannel(i));
3635  }
3636  return DstReg;
3637 }
3638 
3639 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3640  MachineInstr &MI) const {
3641 
3642  // If the pointer is stored in VGPRs, then we need to move it to
3643  // SGPRs using v_readfirstlane. This is safe because we only select
3644  // loads with uniform pointers to SMRD instructions, so we know the
3645  // pointer value is uniform.
3646  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3647  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3648  unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3649  SBase->setReg(SGPR);
3650  }
3651  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3652  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3653  unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3654  SOff->setReg(SGPR);
3655  }
3656 }
3657 
3658 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3659  MachineBasicBlock::iterator I,
3660  const TargetRegisterClass *DstRC,
3661  MachineOperand &Op,
3662  MachineRegisterInfo &MRI,
3663  const DebugLoc &DL) const {
3664  unsigned OpReg = Op.getReg();
3665  unsigned OpSubReg = Op.getSubReg();
3666 
3667  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3668  RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3669 
3670  // Check if operand is already the correct register class.
3671  if (DstRC == OpRC)
3672  return;
3673 
3674  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3675  MachineInstr *Copy =
3676  BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3677 
3678  Op.setReg(DstReg);
3679  Op.setSubReg(0);
3680 
3681  MachineInstr *Def = MRI.getVRegDef(OpReg);
3682  if (!Def)
3683  return;
3684 
3685  // Try to eliminate the copy if it is copying an immediate value.
3686  if (Def->isMoveImmediate())
3687  FoldImmediate(*Copy, *Def, OpReg, &MRI);
3688 }
3689 
3690 // Emit the actual waterfall loop, executing the wrapped instruction for each
3691 // unique value of \p Rsrc across all lanes. In the best case we execute 1
3692 // iteration, in the worst case we execute 64 (once per lane).
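 // Rough sketch of the control flow this helper emits (an assumption-level
 // summary of the BuildMI calls below, not code from the source):
 //
 //   loop:
 //     s_rsrc     = v_readfirstlane_b32 of each 32-bit word of the VGPR rsrc
 //     cond       = (s_rsrc == v_rsrc) for both 64-bit halves, ANDed together
 //     exec, save = s_and_saveexec_b64 cond   ; run lanes with matching rsrc
 //     ... the wrapped instruction executes here ...
 //     exec       = s_xor_b64 exec, save      ; retire the lanes just handled
 //     s_cbranch_execnz loop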
3693 static void
3694 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
3695  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3696  const DebugLoc &DL, MachineOperand &Rsrc) {
3697  MachineBasicBlock::iterator I = LoopBB.begin();
3698 
3699  unsigned VRsrc = Rsrc.getReg();
3700  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
3701 
3702  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3703  unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3704  unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3705  unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3706  unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3707  unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3708  unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3709  unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3710  unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3711 
3712  // Beginning of the loop, read the next Rsrc variant.
3713  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
3714  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
3715  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
3716  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
3717  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
3718  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
3719  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
3720  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
3721 
3722  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
3723  .addReg(SRsrcSub0)
3724  .addImm(AMDGPU::sub0)
3725  .addReg(SRsrcSub1)
3726  .addImm(AMDGPU::sub1)
3727  .addReg(SRsrcSub2)
3728  .addImm(AMDGPU::sub2)
3729  .addReg(SRsrcSub3)
3730  .addImm(AMDGPU::sub3);
3731 
3732  // Update Rsrc operand to use the SGPR Rsrc.
3733  Rsrc.setReg(SRsrc);
3734  Rsrc.setIsKill(true);
3735 
3736  // Identify all lanes with identical Rsrc operands in their VGPRs.
3737  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
3738  .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
3739  .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
3740  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
3741  .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
3742  .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
3743  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
3744  .addReg(CondReg0)
3745  .addReg(CondReg1);
3746 
3747  MRI.setSimpleHint(SaveExec, AndCond);
3748 
3749  // Update EXEC to matching lanes, saving original to SaveExec.
3750  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
3751  .addReg(AndCond, RegState::Kill);
3752 
3753  // The original instruction is here; we insert the terminators after it.
3754  I = LoopBB.end();
3755 
3756  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3757  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
3758  .addReg(AMDGPU::EXEC)
3759  .addReg(SaveExec);
3760  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
3761 }
3762 
3763 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
3764 // with SGPRs by iterating over all unique values across all lanes.
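 // Sketch of the resulting CFG (a summary of the block surgery below, not part
 // of the original source): the current block is split so that
 //
 //   MBB --> LoopBB --> RemainderBB --> (original successors of MBB)
 //            ^____|  (self edge taken while EXEC is still non-zero)
 //
 // EXEC is saved before the loop and restored at the top of RemainderBB.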
3765 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
3766  MachineOperand &Rsrc, MachineDominatorTree *MDT) {
3767  MachineBasicBlock &MBB = *MI.getParent();
3768  MachineFunction &MF = *MBB.getParent();
3769  MachineRegisterInfo &MRI = MF.getRegInfo();
3770  MachineBasicBlock::iterator I(&MI);
3771  const DebugLoc &DL = MI.getDebugLoc();
3772 
3773  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3774 
3775  // Save the EXEC mask
3776  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
3777  .addReg(AMDGPU::EXEC);
3778 
3779  // Killed uses in the instruction we are waterfalling around will be
3780  // incorrect due to the added control-flow.
3781  for (auto &MO : MI.uses()) {
3782  if (MO.isReg() && MO.isUse()) {
3783  MRI.clearKillFlags(MO.getReg());
3784  }
3785  }
3786 
3787  // To insert the loop we need to split the block. Move everything after this
3788  // point to a new block, and insert a new empty block between the two.
3789  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
3790  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
3791  MachineFunction::iterator MBBI(MBB);
3792  ++MBBI;
3793 
3794  MF.insert(MBBI, LoopBB);
3795  MF.insert(MBBI, RemainderBB);
3796 
3797  LoopBB->addSuccessor(LoopBB);
3798  LoopBB->addSuccessor(RemainderBB);
3799 
3800  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
3801  MachineBasicBlock::iterator J = I++;
3802  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3803  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3804  LoopBB->splice(LoopBB->begin(), &MBB, J);
3805 
3806  MBB.addSuccessor(LoopBB);
3807 
3808  // Update dominators. We know that MBB immediately dominates LoopBB, that
3809  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
3810  // dominates all of the successors transferred to it from MBB that MBB used
3811  // to dominate.
3812  if (MDT) {
3813  MDT->addNewBlock(LoopBB, &MBB);
3814  MDT->addNewBlock(RemainderBB, LoopBB);
3815  for (auto &Succ : RemainderBB->successors()) {
3816  if (MDT->dominates(&MBB, Succ)) {
3817  MDT->changeImmediateDominator(Succ, RemainderBB);
3818  }
3819  }
3820  }
3821 
3822  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
3823 
3824  // Restore the EXEC mask
3825  MachineBasicBlock::iterator First = RemainderBB->begin();
3826  BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3827  .addReg(SaveExec);
3828 }
3829 
3830 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
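 // Sketch of the descriptor rewrite done below (illustrative, not from the
 // source): the 128-bit VGPR rsrc is treated as { ptr[63:0], extra words }, and
 // a replacement SGPR rsrc is assembled as
 //   NewSRsrc = { 0 (64-bit base), RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
 // so the caller can fold the real base pointer into the 64-bit VADDR instead.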
3831 static std::tuple<unsigned, unsigned>
3832 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
3833  MachineBasicBlock &MBB = *MI.getParent();
3834  MachineFunction &MF = *MBB.getParent();
3835  MachineRegisterInfo &MRI = MF.getRegInfo();
3836 
3837  // Extract the ptr from the resource descriptor.
3838  unsigned RsrcPtr =
3839  TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
3840  AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3841 
3842  // Create an empty resource descriptor
3843  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3844  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3845  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3846  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3847  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
3848 
3849  // Zero64 = 0
3850  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
3851  .addImm(0);
3852 
3853  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3854  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3855  .addImm(RsrcDataFormat & 0xFFFFFFFF);
3856 
3857  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3858  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3859  .addImm(RsrcDataFormat >> 32);
3860 
3861  // NewSRsrc = {Zero64, SRsrcFormat}
3862  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3863  .addReg(Zero64)
3864  .addImm(AMDGPU::sub0_sub1)
3865  .addReg(SRsrcFormatLo)
3866  .addImm(AMDGPU::sub2)
3867  .addReg(SRsrcFormatHi)
3868  .addImm(AMDGPU::sub3);
3869 
3870  return std::make_tuple(RsrcPtr, NewSRsrc);
3871 }
3872 
3873 void SIInstrInfo::legalizeOperands(MachineInstr &MI,
3874  MachineDominatorTree *MDT) const {
3875  MachineFunction &MF = *MI.getParent()->getParent();
3876  MachineRegisterInfo &MRI = MF.getRegInfo();
3877 
3878  // Legalize VOP2
3879  if (isVOP2(MI) || isVOPC(MI)) {
3880  legalizeOperandsVOP2(MRI, MI);
3881  return;
3882  }
3883 
3884  // Legalize VOP3
3885  if (isVOP3(MI)) {
3886  legalizeOperandsVOP3(MRI, MI);
3887  return;
3888  }
3889 
3890  // Legalize SMRD
3891  if (isSMRD(MI)) {
3892  legalizeOperandsSMRD(MRI, MI);
3893  return;
3894  }
3895 
3896  // Legalize REG_SEQUENCE and PHI
3897  // The register class of the operands must be the same type as the register
3898  // class of the output.
3899  if (MI.getOpcode() == AMDGPU::PHI) {
3900  const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3901  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3902  if (!MI.getOperand(i).isReg() ||
3903  !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3904  continue;
3905  const TargetRegisterClass *OpRC =
3906  MRI.getRegClass(MI.getOperand(i).getReg());
3907  if (RI.hasVGPRs(OpRC)) {
3908  VRC = OpRC;
3909  } else {
3910  SRC = OpRC;
3911  }
3912  }
3913 
3914  // If any of the operands are VGPR registers, then they all must be VGPRs;
3915  // otherwise we will create illegal VGPR->SGPR copies when legalizing
3916  // them.
3917  if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3918  if (!VRC) {
3919  assert(SRC);
3920  VRC = RI.getEquivalentVGPRClass(SRC);
3921  }
3922  RC = VRC;
3923  } else {
3924  RC = SRC;
3925  }
3926 
3927  // Update all the operands so they have the same type.
3928  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3929  MachineOperand &Op = MI.getOperand(I);
3930  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3931  continue;
3932 
3933  // MI is a PHI instruction.
3934  MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3935  MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3936 
3937  // Avoid creating no-op copies with the same src and dst reg class. These
3938  // confuse some of the machine passes.
3939  legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3940  }
3941  }
3942 
3943  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3944  // VGPR dest type and SGPR sources, insert copies so all operands are
3945  // VGPRs. This seems to help operand folding / the register coalescer.
3946  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3947  MachineBasicBlock *MBB = MI.getParent();
3948  const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3949  if (RI.hasVGPRs(DstRC)) {
3950  // Update all the operands so they are VGPR register classes. These may
3951  // not be the same register class because REG_SEQUENCE supports mixing
3952  // subregister index types e.g. sub0_sub1 + sub2 + sub3
3953  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3954  MachineOperand &Op = MI.getOperand(I);
3955  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3956  continue;
3957 
3958  const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3959  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3960  if (VRC == OpRC)
3961  continue;
3962 
3963  legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3964  Op.setIsKill();
3965  }
3966  }
3967 
3968  return;
3969  }
3970 
3971  // Legalize INSERT_SUBREG
3972  // src0 must have the same register class as dst
3973  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3974  unsigned Dst = MI.getOperand(0).getReg();
3975  unsigned Src0 = MI.getOperand(1).getReg();
3976  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3977  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3978  if (DstRC != Src0RC) {
3979  MachineBasicBlock *MBB = MI.getParent();
3980  MachineOperand &Op = MI.getOperand(1);
3981  legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3982  }
3983  return;
3984  }
3985 
3986  // Legalize SI_INIT_M0
3987  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
3988  MachineOperand &Src = MI.getOperand(0);
3989  if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
3990  Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
3991  return;
3992  }
3993 
3994  // Legalize MIMG and MUBUF/MTBUF for shaders.
3995  //
3996  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3997  // scratch memory access. In both cases, the legalization never involves
3998  // conversion to the addr64 form.
3999  if (isMIMG(MI) ||
4000  (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
4001  (isMUBUF(MI) || isMTBUF(MI)))) {
4002  MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
4003  if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
4004  unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
4005  SRsrc->setReg(SGPR);
4006  }
4007 
4008  MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
4009  if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
4010  unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
4011  SSamp->setReg(SGPR);
4012  }
4013  return;
4014  }
4015 
4016  // Legalize MUBUF* instructions.
4017  int RsrcIdx =
4018  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
4019  if (RsrcIdx != -1) {
4020  // We have an MUBUF instruction
4021  MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
4022  unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
4023  if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
4024  RI.getRegClass(RsrcRC))) {
4025  // The operands are legal.
4026  // FIXME: We may need to legalize operands besides srsrc.
4027  return;
4028  }
4029 
4030  // Legalize a VGPR Rsrc.
4031  //
4032  // If the instruction is _ADDR64, we can avoid a waterfall by extracting
4033  // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
4034  // a zero-value SRsrc.
4035  //
4036  // If the instruction is _OFFSET (both idxen and offen disabled), and we
4037  // support ADDR64 instructions, we can convert to ADDR64 and do the same as
4038  // above.
4039  //
4040  // Otherwise we are on non-ADDR64 hardware, and/or we have
4041  // idxen/offen/bothen and we fall back to a waterfall loop.
4042 
4043  MachineBasicBlock &MBB = *MI.getParent();
4044 
4045  MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4046  if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
4047  // This is already an ADDR64 instruction so we need to add the pointer
4048  // extracted from the resource descriptor to the current value of VAddr.
4049  unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4050  unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4051  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4052 
4053  unsigned RsrcPtr, NewSRsrc;
4054  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4055 
4056  // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
4057  DebugLoc DL = MI.getDebugLoc();
4058  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
4059  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4060  .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
4061 
4062  // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
4063  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
4064  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4065  .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
4066 
4067  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4068  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4069  .addReg(NewVAddrLo)
4070  .addImm(AMDGPU::sub0)
4071  .addReg(NewVAddrHi)
4072  .addImm(AMDGPU::sub1);
4073 
4074  VAddr->setReg(NewVAddr);
4075  Rsrc->setReg(NewSRsrc);
4076  } else if (!VAddr && ST.hasAddr64()) {
4077  // This instruction is the _OFFSET variant, so we need to convert it to
4078  // ADDR64.
4079  assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4080  < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4081  "FIXME: Need to emit flat atomics here");
4082 
4083  unsigned RsrcPtr, NewSRsrc;
4084  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4085 
4086  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4087  MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4088  MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4089  MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4090  unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4091 
4092  // Atomics with return have an additional tied operand and are
4093  // missing some of the special bits.
4094  MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4095  MachineInstr *Addr64;
4096 
4097  if (!VDataIn) {
4098  // Regular buffer load / store.
4099  MachineInstrBuilder MIB =
4100  BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4101  .add(*VData)
4102  .addReg(NewVAddr)
4103  .addReg(NewSRsrc)
4104  .add(*SOffset)
4105  .add(*Offset);
4106 
4107  // Atomics do not have this operand.
4108  if (const MachineOperand *GLC =
4109  getNamedOperand(MI, AMDGPU::OpName::glc)) {
4110  MIB.addImm(GLC->getImm());
4111  }
4112 
4113  MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
4114 
4115  if (const MachineOperand *TFE =
4116  getNamedOperand(MI, AMDGPU::OpName::tfe)) {
4117  MIB.addImm(TFE->getImm());
4118  }
4119 
4120  MIB.cloneMemRefs(MI);
4121  Addr64 = MIB;
4122  } else {
4123  // Atomics with return.
4124  Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4125  .add(*VData)
4126  .add(*VDataIn)
4127  .addReg(NewVAddr)
4128  .addReg(NewSRsrc)
4129  .add(*SOffset)
4130  .add(*Offset)
4131  .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
4132  .cloneMemRefs(MI);
4133  }
4134 
4135  MI.removeFromParent();
4136 
4137  // NewVaddr = {RsrcPtr:sub0, RsrcPtr:sub1}
4138  BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
4139  NewVAddr)
4140  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4141  .addImm(AMDGPU::sub0)
4142  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4143  .addImm(AMDGPU::sub1);
4144  } else {
4145  // This is another variant; legalize Rsrc with waterfall loop from VGPRs
4146  // to SGPRs.
4147  loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
4148  }
4149  }
4150 }
4151 
4153  MachineDominatorTree *MDT) const {
4154  SetVectorType Worklist;
4155  Worklist.insert(&TopInst);
4156 
4157  while (!Worklist.empty()) {
4158  MachineInstr &Inst = *Worklist.pop_back_val();
4159  MachineBasicBlock *MBB = Inst.getParent();
4160  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4161 
4162  unsigned Opcode = Inst.getOpcode();
4163  unsigned NewOpcode = getVALUOp(Inst);
4164 
4165  // Handle some special cases
4166  switch (Opcode) {
4167  default:
4168  break;
4169  case AMDGPU::S_ADD_U64_PSEUDO:
4170  case AMDGPU::S_SUB_U64_PSEUDO:
4171  splitScalar64BitAddSub(Worklist, Inst, MDT);
4172  Inst.eraseFromParent();
4173  continue;
4174  case AMDGPU::S_ADD_I32:
4175  case AMDGPU::S_SUB_I32:
4176  // FIXME: The u32 versions currently selected use the carry.
4177  if (moveScalarAddSub(Worklist, Inst, MDT))
4178  continue;
4179 
4180  // Default handling
4181  break;
4182  case AMDGPU::S_AND_B64:
4183  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
4184  Inst.eraseFromParent();
4185  continue;
4186 
4187  case AMDGPU::S_OR_B64:
4188  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
4189  Inst.eraseFromParent();
4190  continue;
4191 
4192  case AMDGPU::S_XOR_B64:
4193  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
4194  Inst.eraseFromParent();
4195  continue;
4196 
4197  case AMDGPU::S_NAND_B64:
4198  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
4199  Inst.eraseFromParent();
4200  continue;
4201 
4202  case AMDGPU::S_NOR_B64:
4203  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
4204  Inst.eraseFromParent();
4205  continue;
4206 
4207  case AMDGPU::S_XNOR_B64:
4208  if (ST.hasDLInsts())
4209  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
4210  else
4211  splitScalar64BitXnor(Worklist, Inst, MDT);
4212  Inst.eraseFromParent();
4213  continue;
4214 
4215  case AMDGPU::S_ANDN2_B64:
4216  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
4217  Inst.eraseFromParent();
4218  continue;
4219 
4220  case AMDGPU::S_ORN2_B64:
4221  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
4222  Inst.eraseFromParent();
4223  continue;
4224 
4225  case AMDGPU::S_NOT_B64:
4226  splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
4227  Inst.eraseFromParent();
4228  continue;
4229 
4230  case AMDGPU::S_BCNT1_I32_B64:
4231  splitScalar64BitBCNT(Worklist, Inst);
4232  Inst.eraseFromParent();
4233  continue;
4234 
4235  case AMDGPU::S_BFE_I64:
4236  splitScalar64BitBFE(Worklist, Inst);
4237  Inst.eraseFromParent();
4238  continue;
4239 
4240  case AMDGPU::S_LSHL_B32:
4241  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4242  NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
4243  swapOperands(Inst);
4244  }
4245  break;
4246  case AMDGPU::S_ASHR_I32:
4247  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4248  NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
4249  swapOperands(Inst);
4250  }
4251  break;
4252  case AMDGPU::S_LSHR_B32:
4253  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4254  NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
4255  swapOperands(Inst);
4256  }
4257  break;
4258  case AMDGPU::S_LSHL_B64:
4259  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4260  NewOpcode = AMDGPU::V_LSHLREV_B64;
4261  swapOperands(Inst);
4262  }
4263  break;
4264  case AMDGPU::S_ASHR_I64:
4265  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4266  NewOpcode = AMDGPU::V_ASHRREV_I64;
4267  swapOperands(Inst);
4268  }
4269  break;
4270  case AMDGPU::S_LSHR_B64:
4271  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4272  NewOpcode = AMDGPU::V_LSHRREV_B64;
4273  swapOperands(Inst);
4274  }
4275  break;
4276 
4277  case AMDGPU::S_ABS_I32:
4278  lowerScalarAbs(Worklist, Inst);
4279  Inst.eraseFromParent();
4280  continue;
4281 
4282  case AMDGPU::S_CBRANCH_SCC0:
4283  case AMDGPU::S_CBRANCH_SCC1:
4284  // Clear unused bits of vcc
4285  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
4286  AMDGPU::VCC)
4287  .addReg(AMDGPU::EXEC)
4288  .addReg(AMDGPU::VCC);
4289  break;
4290 
4291  case AMDGPU::S_BFE_U64:
4292  case AMDGPU::S_BFM_B64:
4293  llvm_unreachable("Moving this op to VALU not implemented");
4294 
4295  case AMDGPU::S_PACK_LL_B32_B16:
4296  case AMDGPU::S_PACK_LH_B32_B16:
4297  case AMDGPU::S_PACK_HH_B32_B16:
4298  movePackToVALU(Worklist, MRI, Inst);
4299  Inst.eraseFromParent();
4300  continue;
4301 
4302  case AMDGPU::S_XNOR_B32:
4303  lowerScalarXnor(Worklist, Inst);
4304  Inst.eraseFromParent();
4305  continue;
4306 
4307  case AMDGPU::S_NAND_B32:
4308  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
4309  Inst.eraseFromParent();
4310  continue;
4311 
4312  case AMDGPU::S_NOR_B32:
4313  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
4314  Inst.eraseFromParent();
4315  continue;
4316 
4317  case AMDGPU::S_ANDN2_B32:
4318  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
4319  Inst.eraseFromParent();
4320  continue;
4321 
4322  case AMDGPU::S_ORN2_B32:
4323  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
4324  Inst.eraseFromParent();
4325  continue;
4326  }
4327 
4328  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
4329  // We cannot move this instruction to the VALU, so we should try to
4330  // legalize its operands instead.
4331  legalizeOperands(Inst, MDT);
4332  continue;
4333  }
4334 
4335  // Use the new VALU Opcode.
4336  const MCInstrDesc &NewDesc = get(NewOpcode);
4337  Inst.setDesc(NewDesc);
4338 
4339  // Remove any references to SCC. Vector instructions can't read from it, and
4340  // we're just about to add the implicit use / defs of VCC, so we don't want
4341  // both.
4342  for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
4343  MachineOperand &Op = Inst.getOperand(i);
4344  if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
4345  Inst.RemoveOperand(i);
4346  addSCCDefUsersToVALUWorklist(Inst, Worklist);
4347  }
4348  }
4349 
4350  if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4351  // We are converting these to a BFE, so we need to add the missing
4352  // operands for the size and offset.
4353  unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4354  Inst.addOperand(MachineOperand::CreateImm(0));
4355  Inst.addOperand(MachineOperand::CreateImm(Size));
4356 
4357  } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4358  // The VALU version adds the second operand to the result, so insert an
4359  // extra 0 operand.
4360  Inst.addOperand(MachineOperand::CreateImm(0));
4361  }
4362 
4363  Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
4364 
4365  if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4366  const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4367  // If we need to move this to VGPRs, we need to unpack the second operand
4368  // back into the 2 separate ones for bit offset and width.
4369  assert(OffsetWidthOp.isImm() &&
4370  "Scalar BFE is only implemented for constant width and offset");
4371  uint32_t Imm = OffsetWidthOp.getImm();
4372 
4373  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4374  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
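  // Worked example (illustrative values, not from the source): for a packed
  // immediate Imm = 0x00100003, Offset = 0x3 (bits [5:0]) and
  // BitWidth = 0x10 = 16 (bits [22:16]); these become the two separate
  // immediate operands added below.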
4375  Inst.RemoveOperand(2); // Remove old immediate.
4376  Inst.addOperand(MachineOperand::CreateImm(Offset));
4377  Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4378  }
4379 
4380  bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4381  unsigned NewDstReg = AMDGPU::NoRegister;
4382  if (HasDst) {
4383  unsigned DstReg = Inst.getOperand(0).getReg();
4385  continue;
4386 
4387  // Update the destination register class.
4388  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4389  if (!NewDstRC)
4390  continue;
4391 
4392  if (Inst.isCopy() &&
4393  TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
4394  NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4395  // Instead of creating a copy where src and dst are the same register
4396  // class, we just replace all uses of dst with src. These kinds of
4397  // copies interfere with the heuristics MachineSink uses to decide
4398  // whether or not to split a critical edge, since the pass assumes
4399  // that copies will end up as machine instructions and not be
4400  // eliminated.
4401  addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4402  MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4403  MRI.clearKillFlags(Inst.getOperand(1).getReg());
4404  Inst.getOperand(0).setReg(DstReg);
4405 
4406  // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4407  // these are deleted later, but at -O0 it would leave a suspicious
4408  // looking illegal copy of an undef register.
4409  for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4410  Inst.RemoveOperand(I);
4411  Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4412  continue;
4413  }
4414 
4415  NewDstReg = MRI.createVirtualRegister(NewDstRC);
4416  MRI.replaceRegWith(DstReg, NewDstReg);
4417  }
4418 
4419  // Legalize the operands
4420  legalizeOperands(Inst, MDT);
4421 
4422  if (HasDst)
4423  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4424  }
4425 }
4426 
4427 // Add/sub require special handling to deal with carry outs.
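 // Sketch of the transform below (assumption-level summary, not from the
 // source): on subtargets with hasAddNoCarry(),
 //   %d = S_ADD_I32 %a, %b, implicit-def $scc
 // is rewritten in place to the no-carry VALU add
 //   %d' = V_ADD_U32_e64 %a, %b      (the dead SCC def is dropped)
 // and all users of the old destination are queued for the same conversion.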
4428 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4429  MachineDominatorTree *MDT) const {
4430  if (ST.hasAddNoCarry()) {
4431  // Assume there is no user of scc since we don't select this in that case.
4432  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4433  // is used.
4434 
4435  MachineBasicBlock &MBB = *Inst.getParent();
4436  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4437 
4438  unsigned OldDstReg = Inst.getOperand(0).getReg();
4439  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4440 
4441  unsigned Opc = Inst.getOpcode();
4442  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4443 
4444  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4445  AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4446 
4447  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4448  Inst.RemoveOperand(3);
4449 
4450  Inst.setDesc(get(NewOpc));
4451  Inst.addImplicitDefUseOperands(*MBB.getParent());
4452  MRI.replaceRegWith(OldDstReg, ResultReg);
4453  legalizeOperands(Inst, MDT);
4454 
4455  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4456  return true;
4457  }
4458 
4459  return false;
4460 }
4461 
4462 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4463  MachineInstr &Inst) const {
4464  MachineBasicBlock &MBB = *Inst.getParent();
4465  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4466  MachineBasicBlock::iterator MII = Inst;
4467  DebugLoc DL = Inst.getDebugLoc();
4468 
4469  MachineOperand &Dest = Inst.getOperand(0);
4470  MachineOperand &Src = Inst.getOperand(1);
4471  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4472  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4473 
4474  unsigned SubOp = ST.hasAddNoCarry() ?
4475  AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4476 
4477  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4478  .addImm(0)
4479  .addReg(Src.getReg());
4480 
4481  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4482  .addReg(Src.getReg())
4483  .addReg(TmpReg);
4484 
4485  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4486  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4487 }
4488 
4489 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4490  MachineInstr &Inst) const {
4491  MachineBasicBlock &MBB = *Inst.getParent();
4492  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4493  MachineBasicBlock::iterator MII = Inst;
4494  const DebugLoc &DL = Inst.getDebugLoc();
4495 
4496  MachineOperand &Dest = Inst.getOperand(0);
4497  MachineOperand &Src0 = Inst.getOperand(1);
4498  MachineOperand &Src1 = Inst.getOperand(2);
4499 
4500  if (ST.hasDLInsts()) {
4501  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4502  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4503  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4504 
4505  BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4506  .add(Src0)
4507  .add(Src1);
4508 
4509  MRI.replaceRegWith(Dest.getReg(), NewDest);
4510  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4511  } else {
4512  // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
4513  // invert either source and then perform the XOR. If either source is a
4514  // scalar register, then we can leave the inversion on the scalar unit to
4515  // achieve a better distribution of scalar and vector instructions.
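  // Sketch of the two resulting scalar instructions (illustrative, not from the
  // source), for the case where src0 is the SGPR:
  //   s_not_b32  t,   src0
  //   s_xor_b32  dst, t, src1
  // Both are added to the worklist so a later iteration can move them to the
  // VALU if their results feed vector code.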
4516  bool Src0IsSGPR = Src0.isReg() &&
4517  RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
4518  bool Src1IsSGPR = Src1.isReg() &&
4519  RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
4520  MachineInstr *Not = nullptr;
4521  MachineInstr *Xor = nullptr;
4522  unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4523  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4524 
4525  // Build a pair of scalar instructions and add them to the work list.
4526  // The next iteration over the work list will lower these to the vector
4527  // unit as necessary.
4528  if (Src0IsSGPR) {
4529  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4530  .add(Src0);
4531  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4532  .addReg(Temp)
4533  .add(Src1);
4534  } else if (Src1IsSGPR) {
4535  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4536  .add(Src1);
4537  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4538  .add(Src0)
4539  .addReg(Temp);
4540  } else {
4541  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
4542  .add(Src0)
4543  .add(Src1);
4544  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4545  .addReg(Temp);
4546  Worklist.insert(Not);
4547  }
4548 
4549  MRI.replaceRegWith(Dest.getReg(), NewDest);
4550 
4551  Worklist.insert(Xor);
4552 
4553  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4554  }
4555 }
4556 
4557 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
4558  MachineInstr &Inst,
4559  unsigned Opcode) const {
4560  MachineBasicBlock &MBB = *Inst.getParent();
4561  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4562  MachineBasicBlock::iterator MII = Inst;
4563  const DebugLoc &DL = Inst.getDebugLoc();
4564 
4565  MachineOperand &Dest = Inst.getOperand(0);
4566  MachineOperand &Src0 = Inst.getOperand(1);
4567  MachineOperand &Src1 = Inst.getOperand(2);
4568 
4569  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4570  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4571 
4572  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
4573  .add(Src0)
4574  .add(Src1);
4575 
4576  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4577  .addReg(Interm);
4578 
4579  Worklist.insert(&Op);
4580  Worklist.insert(&Not);
4581 
4582  MRI.replaceRegWith(Dest.getReg(), NewDest);
4583  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4584 }
4585 
4586 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
4587  MachineInstr &Inst,
4588  unsigned Opcode) const {
4589  MachineBasicBlock &MBB = *Inst.getParent();
4590  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4591  MachineBasicBlock::iterator MII = Inst;
4592  const DebugLoc &DL = Inst.getDebugLoc();
4593 
4594  MachineOperand &Dest = Inst.getOperand(0);
4595  MachineOperand &Src0 = Inst.getOperand(1);
4596  MachineOperand &Src1 = Inst.getOperand(2);
4597 
4598  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4599  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4600 
4601  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
4602  .add(Src1);
4603 
4604  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
4605  .add(Src0)
4606  .addReg(Interm);
4607 
4608  Worklist.insert(&Not);
4609  Worklist.insert(&Op);
4610 
4611  MRI.replaceRegWith(Dest.getReg(), NewDest);
4612  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4613 }
4614 
4615 void SIInstrInfo::splitScalar64BitUnaryOp(
4616  SetVectorType &Worklist, MachineInstr &Inst,
4617  unsigned Opcode) const {
4618  MachineBasicBlock &MBB = *Inst.getParent();
4619  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4620 
4621  MachineOperand &Dest = Inst.getOperand(0);
4622  MachineOperand &Src0 = Inst.getOperand(1);
4623  DebugLoc DL = Inst.getDebugLoc();
4624 
4625  MachineBasicBlock::iterator MII = Inst;
4626 
4627  const MCInstrDesc &InstDesc = get(Opcode);
4628  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4629  MRI.getRegClass(Src0.getReg()) :
4630  &AMDGPU::SGPR_32RegClass;
4631 
4632  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4633 
4634  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4635  AMDGPU::sub0, Src0SubRC);
4636 
4637  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4638  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4639  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4640 
4641  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4642  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4643 
4644  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4645  AMDGPU::sub1, Src0SubRC);
4646 
4647  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4648  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4649 
4650  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4651  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4652  .addReg(DestSub0)
4653  .addImm(AMDGPU::sub0)
4654  .addReg(DestSub1)
4655  .addImm(AMDGPU::sub1);
4656 
4657  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4658 
4659  Worklist.insert(&LoHalf);
4660  Worklist.insert(&HiHalf);
4661 
4662  // We don't need to legalizeOperands here because for a single operand, src0
4663  // will support any kind of input.
4664 
4665  // Move all users of this moved value.
4666  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4667 }
4668 
4669 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
4670  MachineInstr &Inst,
4671  MachineDominatorTree *MDT) const {
4672  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4673 
4674  MachineBasicBlock &MBB = *Inst.getParent();
4675  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4676 
4677  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4678  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4679  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4680 
4681  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4682  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4683 
4684  MachineOperand &Dest = Inst.getOperand(0);
4685  MachineOperand &Src0 = Inst.getOperand(1);
4686  MachineOperand &Src1 = Inst.getOperand(2);
4687  const DebugLoc &DL = Inst.getDebugLoc();
4688  MachineBasicBlock::iterator MII = Inst;
4689 
4690  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4691  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4692  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4693  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4694 
4695  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4696  AMDGPU::sub0, Src0SubRC);
4697  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4698  AMDGPU::sub0, Src1SubRC);
4699 
4700 
4701  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4702  AMDGPU::sub1, Src0SubRC);
4703  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4704  AMDGPU::sub1, Src1SubRC);
4705 
4706  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4707  MachineInstr *LoHalf =
4708  BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4709  .addReg(CarryReg, RegState::Define)
4710  .add(SrcReg0Sub0)
4711  .add(SrcReg1Sub0);
4712 
4713  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4714  MachineInstr *HiHalf =
4715  BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4716  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4717  .add(SrcReg0Sub1)
4718  .add(SrcReg1Sub1)
4719  .addReg(CarryReg, RegState::Kill);
4720 
4721  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4722  .addReg(DestSub0)
4723  .addImm(AMDGPU::sub0)
4724  .addReg(DestSub1)
4725  .addImm(AMDGPU::sub1);
4726 
4727  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4728 
4729  // Try to legalize the operands in case we need to swap the order to keep it
4730  // valid.
4731  legalizeOperands(*LoHalf, MDT);
4732  legalizeOperands(*HiHalf, MDT);
4733 
4734  // Move all users of this moved value.
4735  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4736 }
4737 
4738 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
4739  MachineInstr &Inst, unsigned Opcode,
4740  MachineDominatorTree *MDT) const {
4741  MachineBasicBlock &MBB = *Inst.getParent();
4742  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4743 
4744  MachineOperand &Dest = Inst.getOperand(0);
4745  MachineOperand &Src0 = Inst.getOperand(1);
4746  MachineOperand &Src1 = Inst.getOperand(2);
4747  DebugLoc DL = Inst.getDebugLoc();
4748 
4749  MachineBasicBlock::iterator MII = Inst;
4750 
4751  const MCInstrDesc &InstDesc = get(Opcode);
4752  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4753  MRI.getRegClass(Src0.getReg()) :
4754  &AMDGPU::SGPR_32RegClass;
4755 
4756  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4757  const TargetRegisterClass *Src1RC = Src1.isReg() ?
4758  MRI.getRegClass(Src1.getReg()) :
4759  &AMDGPU::SGPR_32RegClass;
4760 
4761  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4762 
4763  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4764  AMDGPU::sub0, Src0SubRC);
4765  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4766  AMDGPU::sub0, Src1SubRC);
4767  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4768  AMDGPU::sub1, Src0SubRC);
4769  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4770  AMDGPU::sub1, Src1SubRC);
4771 
4772  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4773  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4774  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4775 
4776  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4777  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4778  .add(SrcReg0Sub0)
4779  .add(SrcReg1Sub0);
4780 
4781  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4782  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4783  .add(SrcReg0Sub1)
4784  .add(SrcReg1Sub1);
4785 
4786  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4787  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4788  .addReg(DestSub0)
4789  .addImm(AMDGPU::sub0)
4790  .addReg(DestSub1)
4791  .addImm(AMDGPU::sub1);
4792 
4793  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4794 
4795  Worklist.insert(&LoHalf);
4796  Worklist.insert(&HiHalf);
4797 
4798  // Move all users of this moved value.
4799  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4800 }
4801 
4802 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
4803  MachineInstr &Inst,
4804  MachineDominatorTree *MDT) const {
4805  MachineBasicBlock &MBB = *Inst.getParent();
4806  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4807 
4808  MachineOperand &Dest = Inst.getOperand(0);
4809  MachineOperand &Src0 = Inst.getOperand(1);
4810  MachineOperand &Src1 = Inst.getOperand(2);
4811  const DebugLoc &DL = Inst.getDebugLoc();
4812 
4813  MachineBasicBlock::iterator MII = Inst;
4814 
4815  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4816 
4817  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4818 
4819  MachineOperand* Op0;
4820  MachineOperand* Op1;
4821 
4822  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
4823  Op0 = &Src0;
4824  Op1 = &Src1;
4825  } else {
4826  Op0 = &Src1;
4827  Op1 = &Src0;
4828  }
4829 
4830  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
4831  .add(*Op0);
4832 
4833  unsigned NewDest = MRI.createVirtualRegister(DestRC);
4834 
4835  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
4836  .addReg(Interm)
4837  .add(*Op1);
4838 
4839  MRI.replaceRegWith(Dest.getReg(), NewDest);
4840 
4841  Worklist.insert(&Xor);
4842 }
4843 
4844 void SIInstrInfo::splitScalar64BitBCNT(
4845  SetVectorType &Worklist, MachineInstr &Inst) const {
4846  MachineBasicBlock &MBB = *Inst.getParent();
4847  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4848 
4849  MachineBasicBlock::iterator MII = Inst;
4850  const DebugLoc &DL = Inst.getDebugLoc();
4851 
4852  MachineOperand &Dest = Inst.getOperand(0);
4853  MachineOperand &Src = Inst.getOperand(1);
4854 
4855  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4856  const TargetRegisterClass *SrcRC = Src.isReg() ?
4857  MRI.getRegClass(Src.getReg()) :
4858  &AMDGPU::SGPR_32RegClass;
4859 
4860  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4861  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4862 
4863  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4864 
4865  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4866  AMDGPU::sub0, SrcSubRC);
4867  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4868  AMDGPU::sub1, SrcSubRC);
4869 
4870  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4871 
4872  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4873 
4874  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4875 
4876  // We don't need to legalize operands here. src0 for either instruction can be
4877  // an SGPR, and the second input is unused or determined here.
4878  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4879 }
4880 
4881 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4882  MachineInstr &Inst) const {
4883  MachineBasicBlock &MBB = *Inst.getParent();
4884  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4885  MachineBasicBlock::iterator MII = Inst;
4886  const DebugLoc &DL = Inst.getDebugLoc();
4887 
4888  MachineOperand &Dest = Inst.getOperand(0);
4889  uint32_t Imm = Inst.getOperand(2).getImm();
4890  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4891  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4892 
4893  (void) Offset;
4894 
4895  // Only sext_inreg cases handled.
4896  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4897  Offset == 0 && "Not implemented");
4898 
4899  if (BitWidth < 32) {
4900  unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4901  unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4902  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4903 
4904  BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4905  .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4906  .addImm(0)
4907  .addImm(BitWidth);
4908 
4909  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4910  .addImm(31)
4911  .addReg(MidRegLo);
4912 
4913  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4914  .addReg(MidRegLo)
4915  .addImm(AMDGPU::sub0)
4916  .addReg(MidRegHi)
4917  .addImm(AMDGPU::sub1);
4918 
4919  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4920  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4921  return;
4922  }
4923 
4924  MachineOperand &Src = Inst.getOperand(1);
4925  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4926  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4927 
4928  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4929  .addImm(31)
4930  .addReg(Src.getReg(), 0, AMDGPU::sub0);
4931 
4932  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4933  .addReg(Src.getReg(), 0, AMDGPU::sub0)
4934  .addImm(AMDGPU::sub0)
4935  .addReg(TmpReg)
4936  .addImm(AMDGPU::sub1);
4937 
4938  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4939  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4940 }
4941 
4942 void SIInstrInfo::addUsersToMoveToVALUWorklist(
4943  unsigned DstReg,
4944  MachineRegisterInfo &MRI,
4945  SetVectorType &Worklist) const {
4946  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4947  E = MRI.use_end(); I != E;) {
4948  MachineInstr &UseMI = *I->getParent();
4949 
4950  unsigned OpNo = 0;
4951 
4952  switch (UseMI.getOpcode()) {
4953  case AMDGPU::COPY:
4954  case AMDGPU::WQM:
4955  case AMDGPU::WWM:
4956  case AMDGPU::REG_SEQUENCE:
4957  case AMDGPU::PHI:
4958  case AMDGPU::INSERT_SUBREG:
4959  break;
4960  default:
4961  OpNo = I.getOperandNo();
4962  break;
4963  }
4964 
4965  if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) {
4966  Worklist.insert(&UseMI);
4967 
4968  do {
4969  ++I;
4970  } while (I != E && I->getParent() == &UseMI);
4971  } else {
4972  ++I;
4973  }
4974  }
4975 }
4976 
4977 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4978  MachineRegisterInfo &MRI,
4979  MachineInstr &Inst) const {
4980  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4981  MachineBasicBlock *MBB = Inst.getParent();
4982  MachineOperand &Src0 = Inst.getOperand(1);
4983  MachineOperand &Src1 = Inst.getOperand(2);
4984  const DebugLoc &DL = Inst.getDebugLoc();
4985 
4986  switch (Inst.getOpcode()) {
4987  case AMDGPU::S_PACK_LL_B32_B16: {
4988  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4989  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4990 
4991  // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
4992  // 0.
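 // Sketch of the lowering below (illustrative, not from the source):
 //   tmp    = src0 & 0xffff          ; keep the low half of src0
 //   result = (src1 << 16) | tmp     ; V_LSHL_OR_B32 does the shift + or
 // i.e. s_pack_ll_b32_b16 packs the low 16 bits of src0 and src1 into one
 // 32-bit result.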
4993  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4994  .addImm(0xffff);
4995 
4996  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
4997  .addReg(ImmReg, RegState::Kill)
4998  .add(Src0);
4999 
5000  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
5001  .add(Src1)
5002  .addImm(16)
5003  .addReg(TmpReg, RegState::Kill);
5004  break;
5005  }
5006  case AMDGPU::S_PACK_LH_B32_B16: {
5007  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5008  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5009  .addImm(0xffff);
5010  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
5011  .addReg(ImmReg, RegState::Kill)
5012  .add(Src0)
5013  .add(Src1);
5014  break;
5015  }
5016  case AMDGPU::S_PACK_HH_B32_B16: {
5017  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5018  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5019  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
5020  .addImm(16)
5021  .add(Src0);
5022  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5023  .addImm(0xffff0000);
5024  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
5025  .add(Src1)
5026  .addReg(ImmReg, RegState::Kill)
5027  .addReg(TmpReg, RegState::Kill);
5028  break;
5029  }
5030  default:
5031  llvm_unreachable("unhandled s_pack_* instruction");
5032  }
5033 
5034  MachineOperand &Dest = Inst.getOperand(0);
5035  MRI.replaceRegWith(Dest.getReg(), ResultReg);
5036  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5037 }
5038 
5039 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
5040  MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
5041  // This assumes that all the users of SCC are in the same block
5042  // as the SCC def.
5043  for (MachineInstr &MI :
5044  make_range(MachineBasicBlock::iterator(SCCDefInst),
5045  SCCDefInst.getParent()->end())) {
5046  // Exit if we find another SCC def.
5047  if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
5048  return;
5049 
5050  if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
5051  Worklist.insert(&MI);
5052  }
5053 }
5054 
5055 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
5056  const MachineInstr &Inst) const {
5057  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
5058 
5059  switch (Inst.getOpcode()) {
5060  // For target instructions, getOpRegClass just returns the virtual register
5061  // class associated with the operand, so we need to find an equivalent VGPR
5062  // register class in order to move the instruction to the VALU.
5063  case AMDGPU::COPY:
5064  case AMDGPU::PHI:
5065  case AMDGPU::REG_SEQUENCE:
5066  case AMDGPU::INSERT_SUBREG:
5067  case AMDGPU::WQM:
5068  case AMDGPU::WWM:
5069  if (RI.hasVGPRs(NewDstRC))
5070  return nullptr;
5071 
5072  NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
5073  if (!NewDstRC)
5074  return nullptr;
5075  return NewDstRC;
5076  default:
5077  return NewDstRC;
5078  }
5079 }
5080 
5081 // Find the one SGPR operand we are allowed to use.
5082 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
5083  int OpIndices[3]) const {
5084  const MCInstrDesc &Desc = MI.getDesc();
5085 
5086  // Find the one SGPR operand we are allowed to use.
5087  //
5088  // First we need to consider the instruction's operand requirements before
5089  // legalizing. Some operands are required to be SGPRs, such as implicit uses
5090  // of VCC, but we are still bound by the constant bus requirement to only use
5091  // one.
5092  //
5093  // If the operand's class is an SGPR, we can never move it.
5094 
5095  unsigned SGPRReg = findImplicitSGPRRead(MI);
5096  if (SGPRReg != AMDGPU::NoRegister)
5097  return SGPRReg;
5098 
5099  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
5100  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5101 
5102  for (unsigned i = 0; i < 3; ++i) {
5103  int Idx = OpIndices[i];
5104  if (Idx == -1)
5105  break;
5106 
5107  const MachineOperand &MO = MI.getOperand(Idx);
5108  if (!MO.isReg())
5109  continue;
5110 
5111  // Is this operand statically required to be an SGPR based on the operand
5112  // constraints?
5113  const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
5114  bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
5115  if (IsRequiredSGPR)
5116  return MO.getReg();
5117 
5118  // If this could be a VGPR or an SGPR, check the dynamic register class.
5119  unsigned Reg = MO.getReg();
5120  const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
5121  if (RI.isSGPRClass(RegRC))
5122  UsedSGPRs[i] = Reg;
5123  }
5124 
5125  // We don't have a required SGPR operand, so we have a bit more freedom in
5126  // selecting operands to move.
5127 
5128  // Try to select the most used SGPR. If an SGPR is equal to one of the
5129  // others, we choose that.
5130  //
5131  // e.g.
5132  // V_FMA_F32 v0, s0, s0, s0 -> No moves
5133  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
5134 
5135  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
5136  // prefer those.
5137 
5138  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
5139  if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
5140  SGPRReg = UsedSGPRs[0];
5141  }
5142 
5143  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
5144  if (UsedSGPRs[1] == UsedSGPRs[2])
5145  SGPRReg = UsedSGPRs[1];
5146  }
5147 
5148  return SGPRReg;
5149 }
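// Editorial sketch of the expected calling pattern (compare
// legalizeOperandsVOP3 earlier in this file): the caller collects the source
// operand indices, asks which single SGPR may stay on the constant bus, and
// then copies every other SGPR operand into a VGPR.
//
//   int VOP3Idx[3] = {
//     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
//     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
//     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
//   };
//   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);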
5150 
5151 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
5152  unsigned OperandName) const {
5153  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
5154  if (Idx == -1)
5155  return nullptr;
5156 
5157  return &MI.getOperand(Idx);
5158 }
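// Typical usage (editorial sketch; the local names are illustrative): the
// lookup returns nullptr when an opcode lacks the named operand, so callers
// test the pointer before dereferencing it.
//
//   if (const MachineOperand *Offset =
//           getNamedOperand(MI, AMDGPU::OpName::offset)) {
//     int64_t Imm = Offset->getImm();
//     // ... use the immediate offset ...
//   }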
5159 
5160 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
5161  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
5162  if (ST.isAmdHsaOS()) {
5163  // Set ATC = 1. GFX9 doesn't have this bit.
5164  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5165  RsrcDataFormat |= (1ULL << 56);
5166 
5167  // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
5168  // Note that this setting disables the TC L2 cache and therefore decreases performance.
5169  if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
5170  RsrcDataFormat |= (2ULL << 59);
5171  }
5172 
5173  return RsrcDataFormat;
5174 }
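// Worked example (editorial, not from the original source): on an amdhsa OS
// with a VOLCANIC_ISLANDS subtarget both branches above apply, so the
// returned descriptor is
//
//   uint64_t Fmt = AMDGPU::RSRC_DATA_FORMAT;
//   Fmt |= (1ULL << 56);   // ATC = 1
//   Fmt |= (2ULL << 59);   // MTYPE = 2 (MTYPE_UC, uncached)
//
// whereas on GFX9 neither field exists and the base format is returned
// unchanged.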
5175 
5176 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
5177  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
5178  AMDGPU::RSRC_TID_ENABLE |
5179  0xffffffff; // Size
5180 
5181  // GFX9 doesn't have ELEMENT_SIZE.
5182  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5183  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
5184  Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
5185  }
5186 
5187  // IndexStride = 64.
5188  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
5189 
5190  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
5191  // Clear them unless we want a huge stride.
5192  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5193  Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
5194 
5195  return Rsrc23;
5196 }
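// Worked example (editorial): with a maximum private element size of 4 bytes
// on a pre-GFX9 subtarget, ELEMENT_SIZE is encoded as Log2_32(4) - 1 = 1,
// INDEX_STRIDE = 3 selects a 64-lane stride, TID_ENABLE is set, and the low
// 32 bits hold the 0xffffffff size. On VOLCANIC_ISLANDS and newer the
// DATA_FORMAT bits are cleared so they cannot be misread as stride bits
// [14:17].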
5197 
5198 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
5199  unsigned Opc = MI.getOpcode();
5200 
5201  return isSMRD(Opc);
5202 }
5203 
5204 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
5205  unsigned Opc = MI.getOpcode();
5206 
5207  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
5208 }
5209 
5210 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
5211  int &FrameIndex) const {
5212  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5213  if (!Addr || !Addr->isFI())
5214  return AMDGPU::NoRegister;
5215 
5216  assert(!MI.memoperands_empty() &&
5217  "Frame index access should have a memoperand!");
5218 
5219  FrameIndex = Addr->getIndex();
5220  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
5221 }
5222 
5223 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
5224  int &FrameIndex) const {
5225  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
5226  assert(Addr && Addr->isFI());
5227  FrameIndex = Addr->getIndex();
5228  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
5229 }
5230 
5231 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
5232  int &FrameIndex) const {
5233  if (!MI.mayLoad())
5234  return AMDGPU::NoRegister;
5235 
5236  if (isMUBUF(MI) || isVGPRSpill(MI))
5237  return isStackAccess(MI, FrameIndex);
5238 
5239  if (isSGPRSpill(MI))
5240  return isSGPRStackAccess(MI, FrameIndex);
5241 
5242  return AMDGPU::NoRegister;
5243 }
5244 
5245 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
5246  int &FrameIndex) const {
5247  if (!MI.mayStore())
5248  return AMDGPU::NoRegister;
5249 
5250  if (isMUBUF(MI) || isVGPRSpill(MI))
5251  return isStackAccess(MI, FrameIndex);
5252 
5253  if (isSGPRSpill(MI))
5254  return isSGPRStackAccess(MI, FrameIndex);
5255 
5256  return AMDGPU::NoRegister;
5257 }
5258 
5259 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
5260  unsigned Size = 0;
5261  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
5262  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
5263  while (++I != E && I->isInsideBundle()) {
5264  assert(!I->isBundle() && "No nested bundle!");
5265  Size += getInstSizeInBytes(*I);
5266  }
5267 
5268  return Size;
5269 }
5270 
5271 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
5272  unsigned Opc = MI.getOpcode();
5273  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
5274  unsigned DescSize = Desc.getSize();
5275 
5276  // If we have a definitive size, we can use it. Otherwise we need to inspect
5277  // the operands to know the size.
5278  if (isFixedSize(MI))
5279  return DescSize;
5280 
5281  // 4-byte instructions may have a 32-bit literal encoded after them. Check
5282  // operands that could ever be literals.
5283  if (isVALU(MI) || isSALU(MI)) {
5284  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5285  if (Src0Idx == -1)
5286  return DescSize; // No operands.
5287 
5288  if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
5289  return DescSize + 4;
5290 
5291  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);