1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
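///
/// As an illustrative sketch (the loop actually built by
/// executeInWaterfallLoop below; register numbers are placeholders), a single
/// divergent 32-bit "SGPR" operand is handled roughly as:
///
///   v_readfirstlane_b32 s0, v0       ; pull one lane's value into an SGPR
///   v_cmp_eq_u32 vcc, s0, v0         ; find all lanes holding that value
///   s_and_saveexec_b64 s[2:3], vcc   ; enable only those lanes
///   <use s0 as the uniform operand>
///   s_xor_b64 exec, exec, s[2:3]     ; retire the lanes just handled
///   s_cbranch_execnz <loop>          ; repeat until every lane is covered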
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
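///
/// For example, after regbank legalization a uniform boolean AND becomes a
/// 32-bit SALU operation, while the divergent form keeps s1 lane-mask operands
/// on the VCC bank (illustrative generic MIR; names are placeholders):
///
///   %c:sgpr(s32) = G_AND %a:sgpr(s32), %b:sgpr(s32)   ; SALU boolean
///   %c:vcc(s1)   = G_AND %a:vcc(s1),   %b:vcc(s1)     ; VALU boolean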
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
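///
/// For example (illustrative generic MIR; names are placeholders), a boolean
/// truncated from a loaded value stays on the VGPR bank, and a VCC-bank use of
/// it must mask and compare:
///
///   %v:vgpr(s32) = G_LOAD ...
///   %t:vgpr(s1)  = G_TRUNC %v            ; never VCC bank
///   %m:vgpr(s32) = G_AND %v, 1           ; clear the high bits
///   %b:vcc(s1)   = G_ICMP intpred(ne), %m, 0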
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this rises to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
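///
/// For example, on targets limited to a single constant bus read per VALU
/// instruction, v_add_f32 v1, s0, s0 is legal (one unique SGPR), while
/// v_add_f32 v1, s0, s1 reads two unique SGPRs and violates the restriction.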
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
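///
/// For example, a G_ADD with one SGPR and one VGPR input is simply given an
/// all-VGPR mapping, and a copy is inserted for the SGPR source (illustrative
/// generic MIR; names are placeholders):
///
///   %t:vgpr(s32) = COPY %s:sgpr(s32)
///   %r:vgpr(s32) = G_ADD %t, %v:vgpr(s32)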
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
84 
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
87 
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
90 
91 using namespace llvm;
92 using namespace MIPatternMatch;
93 
94 namespace {
95 
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99  const AMDGPURegisterBankInfo &RBI;
100  MachineRegisterInfo &MRI;
101  const RegisterBank *NewBank;
102  SmallVector<MachineInstr *, 4> NewInsts;
103 
104 public:
105  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106  MachineRegisterInfo &MRI_, const RegisterBank *RB)
107  : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108 
109  ~ApplyRegBankMapping() {
110  for (MachineInstr *MI : NewInsts)
111  applyBank(*MI);
112  }
113 
114  /// Set any registers that don't have a set register class or bank to SALU.
115  void applyBank(MachineInstr &MI) {
116  const unsigned Opc = MI.getOpcode();
117  if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118  Opc == AMDGPU::G_SEXT) {
119  // LegalizerHelper wants to use the basic legalization artifacts when
120  // widening etc. We don't handle selection with vcc in artifact sources,
121  // so we need to use a select instead to handle these properly.
122  Register DstReg = MI.getOperand(0).getReg();
123  Register SrcReg = MI.getOperand(1).getReg();
124  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125  if (SrcBank == &AMDGPU::VCCRegBank) {
126  const LLT S32 = LLT::scalar(32);
127  assert(MRI.getType(SrcReg) == LLT::scalar(1));
128  assert(MRI.getType(DstReg) == S32);
129  assert(NewBank == &AMDGPU::VGPRRegBank);
130 
131  // Replace the extension with a select, which really uses the boolean
132  // source.
133  MachineIRBuilder B(MI);
134  auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135  auto False = B.buildConstant(S32, 0);
136  B.buildSelect(DstReg, SrcReg, True, False);
137  MRI.setRegBank(True.getReg(0), *NewBank);
138  MRI.setRegBank(False.getReg(0), *NewBank);
139  MI.eraseFromParent();
140  }
141 
142  assert(!MRI.getRegClassOrRegBank(DstReg));
143  MRI.setRegBank(DstReg, *NewBank);
144  return;
145  }
146 
147 #ifndef NDEBUG
148  if (Opc == AMDGPU::G_TRUNC) {
149  Register DstReg = MI.getOperand(0).getReg();
150  const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151  assert(DstBank != &AMDGPU::VCCRegBank);
152  }
153 #endif
154 
155  for (MachineOperand &Op : MI.operands()) {
156  if (!Op.isReg())
157  continue;
158 
159  // We may see physical registers if building a real MI
160  Register Reg = Op.getReg();
161  if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162  continue;
163 
164  const RegisterBank *RB = NewBank;
165  if (MRI.getType(Reg) == LLT::scalar(1)) {
166  assert(NewBank == &AMDGPU::VGPRRegBank &&
167  "s1 operands should only be used for vector bools");
168  assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169  MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170  "not expecting legalization artifacts here");
171  RB = &AMDGPU::VCCRegBank;
172  }
173 
174  MRI.setRegBank(Reg, *RB);
175  }
176  }
177 
178  void erasingInstr(MachineInstr &MI) override {}
179 
180  void createdInstr(MachineInstr &MI) override {
181  // At this point, the instruction was just inserted and has no operands.
182  NewInsts.push_back(&MI);
183  }
184 
185  void changingInstr(MachineInstr &MI) override {}
186  void changedInstr(MachineInstr &MI) override {
187  // FIXME: In principle we should probably add the instruction to NewInsts,
188  // but the way the LegalizerHelper uses the observer, we will always see the
189  // registers we need to set the regbank on also referenced in a new
190  // instruction.
191  }
192 };
193 
194 }
195 
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197  : Subtarget(ST),
198  TRI(Subtarget.getRegisterInfo()),
199  TII(Subtarget.getInstrInfo()) {
200 
201  // HACK: Until this is fully tablegen'd.
202  static llvm::once_flag InitializeRegisterBankFlag;
203 
204  static auto InitializeRegisterBankOnce = [this]() {
205  assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206  &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207  &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208  (void)this;
209  };
210 
211  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212 }
213 
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215  unsigned BankID = Bank.getID();
216  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217 }
218 
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220  const RegisterBank &Src,
221  unsigned Size) const {
222  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224  (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225  return std::numeric_limits<unsigned>::max();
226  }
227 
228  // Bool values are tricky, because the meaning is based on context. The SCC
229  // and VCC banks are for the natural scalar and vector conditions produced by
230  // a compare.
231  //
232  // Legalization doesn't know about the necessary context, so an s1 use may
233  // have been a truncate from an arbitrary value, in which case a copy (lowered
234  // as a compare with 0) needs to be inserted.
235  if (Size == 1 &&
236  (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237  (isVectorRegisterBank(Src) ||
238  Src.getID() == AMDGPU::SGPRRegBankID ||
239  Src.getID() == AMDGPU::VCCRegBankID))
240  return std::numeric_limits<unsigned>::max();
241 
242  // There is no direct copy between AGPRs.
243  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244  Src.getID() == AMDGPU::AGPRRegBankID)
245  return 4;
246 
247  return RegisterBankInfo::copyCost(Dst, Src, Size);
248 }
249 
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251  const ValueMapping &ValMapping,
252  const RegisterBank *CurBank) const {
253  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254  // VGPR.
255  // FIXME: Is there a better way to do this?
256  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257  return 10; // This is expensive.
258 
259  assert(ValMapping.NumBreakDowns == 2 &&
260  ValMapping.BreakDown[0].Length == 32 &&
261  ValMapping.BreakDown[0].StartIdx == 0 &&
262  ValMapping.BreakDown[1].Length == 32 &&
263  ValMapping.BreakDown[1].StartIdx == 32 &&
264  ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265 
266  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268  // want.
269 
270  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271  // alignment restrictions, but this probably isn't important.
272  return 1;
273 }
274 
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277  LLT Ty) const {
278  if (&RC == &AMDGPU::SReg_1RegClass)
279  return AMDGPU::VCCRegBank;
280 
281  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282  // VCC-like use.
283  if (TRI->isSGPRClass(&RC)) {
284  // FIXME: This probably came from a copy from a physical register, which
285  // should be inferable from the copied-to type. We don't have many boolean
286  // physical register constraints so just assume a normal SGPR for now.
287  if (!Ty.isValid())
288  return AMDGPU::SGPRRegBank;
289 
290  return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291  }
292 
293  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294 }
295 
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299  const MachineInstr &MI, const MachineRegisterInfo &MRI,
300  const std::array<unsigned, NumOps> RegSrcOpIdx,
301  ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302 
303  InstructionMappings AltMappings;
304 
305  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306 
307  unsigned Sizes[NumOps];
308  for (unsigned I = 0; I < NumOps; ++I) {
309  Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310  Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311  }
312 
313  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314  unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315  Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316  }
317 
318  // getInstrMapping's default mapping uses ID 1, so start at 2.
319  unsigned MappingID = 2;
320  for (const auto &Entry : Table) {
321  for (unsigned I = 0; I < NumOps; ++I) {
322  int OpIdx = RegSrcOpIdx[I];
323  Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324  }
325 
326  AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327  getOperandsMapping(Operands),
328  Operands.size()));
329  }
330 
331  return AltMappings;
332 }
333 
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336  const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337  switch (MI.getIntrinsicID()) {
338  case Intrinsic::amdgcn_readlane: {
339  static const OpRegBankEntry<3> Table[2] = {
340  // Perfectly legal.
341  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342 
343  // Need a readfirstlane for the index.
344  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345  };
346 
347  const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348  return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349  }
350  case Intrinsic::amdgcn_writelane: {
351  static const OpRegBankEntry<4> Table[4] = {
352  // Perfectly legal.
353  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354 
355  // Need readfirstlane of first op
356  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357 
358  // Need readfirstlane of second op
359  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360 
361  // Need readfirstlane of both ops
362  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363  };
364 
365  // dst, value, lane select, original value
366  const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367  return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368  }
369  default:
370  return RegisterBankInfo::getInstrAlternativeMappings(MI);
371  }
372 }
373 
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376  const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377 
378  switch (MI.getIntrinsicID()) {
379  case Intrinsic::amdgcn_s_buffer_load: {
380  static const OpRegBankEntry<2> Table[4] = {
381  // Perfectly legal.
382  { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383 
384  // Only need 1 register in loop
385  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386 
387  // Have to waterfall the resource.
388  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389 
390  // Have to waterfall the resource, and the offset.
391  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392  };
393 
394  // rsrc, offset
395  const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396  return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397  }
398  case Intrinsic::amdgcn_ds_ordered_add:
399  case Intrinsic::amdgcn_ds_ordered_swap: {
400  // VGPR = M0, VGPR
401  static const OpRegBankEntry<3> Table[2] = {
402  // Perfectly legal.
403  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
404 
405  // Need a readfirstlane for m0
406  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407  };
408 
409  const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410  return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411  }
412  case Intrinsic::amdgcn_s_sendmsg:
413  case Intrinsic::amdgcn_s_sendmsghalt: {
414  // FIXME: Should have no register for immediate
415  static const OpRegBankEntry<1> Table[2] = {
416  // Perfectly legal.
417  { { AMDGPU::SGPRRegBankID }, 1 },
418 
419  // Need readlane
420  { { AMDGPU::VGPRRegBankID }, 3 }
421  };
422 
423  const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424  return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425  }
426  default:
427  return RegisterBankInfo::getInstrAlternativeMappings(MI);
428  }
429 }
430 
431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433  return I && I->getMetadata("amdgpu.noclobber");
434 }
435 
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439  if (!MI.hasOneMemOperand())
440  return false;
441 
442  const MachineMemOperand *MMO = *MI.memoperands_begin();
443  const unsigned AS = MMO->getAddrSpace();
444  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445  AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446  // Require 4-byte alignment.
447  return MMO->getAlign() >= Align(4) &&
448  // Can't do a scalar atomic load.
449  !MMO->isAtomic() &&
450  // Don't use scalar loads for volatile accesses to non-constant address
451  // spaces.
452  (IsConst || !MMO->isVolatile()) &&
453  // Memory must be known constant, or not written before this load.
454  (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455  AMDGPUInstrInfo::isUniformMMO(MMO);
456 }
457 
458 RegisterBankInfo::InstructionMappings
459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460  const MachineInstr &MI) const {
461 
462  const MachineFunction &MF = *MI.getParent()->getParent();
463  const MachineRegisterInfo &MRI = MF.getRegInfo();
464 
465 
466  InstructionMappings AltMappings;
467  switch (MI.getOpcode()) {
468  case TargetOpcode::G_CONSTANT: {
469  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470  if (Size == 1) {
471  static const OpRegBankEntry<1> Table[3] = {
472  { { AMDGPU::VGPRRegBankID }, 1 },
473  { { AMDGPU::SGPRRegBankID }, 1 },
474  { { AMDGPU::VCCRegBankID }, 1 }
475  };
476 
477  return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478  }
479 
480  LLVM_FALLTHROUGH;
481  }
482  case TargetOpcode::G_FCONSTANT:
483  case TargetOpcode::G_FRAME_INDEX:
484  case TargetOpcode::G_GLOBAL_VALUE: {
485  static const OpRegBankEntry<1> Table[2] = {
486  { { AMDGPU::VGPRRegBankID }, 1 },
487  { { AMDGPU::SGPRRegBankID }, 1 }
488  };
489 
490  return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491  }
492  case TargetOpcode::G_AND:
493  case TargetOpcode::G_OR:
494  case TargetOpcode::G_XOR: {
495  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496 
497  if (Size == 1) {
498  // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499  const InstructionMapping &SCCMapping = getInstructionMapping(
500  1, 1, getOperandsMapping(
501  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504  3); // Num Operands
505  AltMappings.push_back(&SCCMapping);
506 
507  const InstructionMapping &VCCMapping0 = getInstructionMapping(
508  2, 1, getOperandsMapping(
509  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512  3); // Num Operands
513  AltMappings.push_back(&VCCMapping0);
514  return AltMappings;
515  }
516 
517  if (Size != 64)
518  break;
519 
520  const InstructionMapping &SSMapping = getInstructionMapping(
521  1, 1, getOperandsMapping(
522  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525  3); // Num Operands
526  AltMappings.push_back(&SSMapping);
527 
528  const InstructionMapping &VVMapping = getInstructionMapping(
529  2, 2, getOperandsMapping(
530  {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533  3); // Num Operands
534  AltMappings.push_back(&VVMapping);
535  break;
536  }
537  case TargetOpcode::G_LOAD:
538  case TargetOpcode::G_ZEXTLOAD:
539  case TargetOpcode::G_SEXTLOAD: {
540  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542  unsigned PtrSize = PtrTy.getSizeInBits();
543  unsigned AS = PtrTy.getAddressSpace();
544 
545  if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546  AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547  isScalarLoadLegal(MI)) {
548  const InstructionMapping &SSMapping = getInstructionMapping(
549  1, 1, getOperandsMapping(
550  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552  2); // Num Operands
553  AltMappings.push_back(&SSMapping);
554  }
555 
556  const InstructionMapping &VVMapping = getInstructionMapping(
557  2, 1,
558  getOperandsMapping(
559  {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561  2); // Num Operands
562  AltMappings.push_back(&VVMapping);
563 
564  // It may be possible to have a vgpr = load sgpr mapping here, because
565  // the mubuf instructions support this kind of load, but probably for only
566  // gfx7 and older. However, the addressing mode matching in the instruction
567  // selector should be able to do a better job of detecting and selecting
568  // these kinds of loads from the vgpr = load vgpr mapping.
569 
570  return AltMappings;
571 
572  }
573  case TargetOpcode::G_SELECT: {
574  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575  const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576  getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580  4); // Num Operands
581  AltMappings.push_back(&SSMapping);
582 
583  const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584  getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588  4); // Num Operands
589  AltMappings.push_back(&VVMapping);
590 
591  return AltMappings;
592  }
593  case TargetOpcode::G_UADDE:
594  case TargetOpcode::G_USUBE:
595  case TargetOpcode::G_SADDE:
596  case TargetOpcode::G_SSUBE: {
597  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
598  const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
599  getOperandsMapping(
600  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
601  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
602  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
605  5); // Num Operands
606  AltMappings.push_back(&SSMapping);
607 
608  const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
609  getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
610  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
611  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
614  5); // Num Operands
615  AltMappings.push_back(&VVMapping);
616  return AltMappings;
617  }
618  case AMDGPU::G_BRCOND: {
619  assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
620 
621  // TODO: Change type to 32 for scalar
622  const InstructionMapping &SMapping = getInstructionMapping(
623  1, 1, getOperandsMapping(
624  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
625  2); // Num Operands
626  AltMappings.push_back(&SMapping);
627 
628  const InstructionMapping &VMapping = getInstructionMapping(
629  1, 1, getOperandsMapping(
630  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
631  2); // Num Operands
632  AltMappings.push_back(&VMapping);
633  return AltMappings;
634  }
635  case AMDGPU::G_INTRINSIC:
636  return getInstrAlternativeMappingsIntrinsic(MI, MRI);
637  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
638  return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
639  default:
640  break;
641  }
642  return RegisterBankInfo::getInstrAlternativeMappings(MI);
643 }
644 
645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
646  MachineIRBuilder &B,
647  SmallVector<Register, 2> &Regs,
648  LLT HalfTy,
649  Register Reg) const {
650  assert(HalfTy.getSizeInBits() == 32);
651  MachineRegisterInfo *MRI = B.getMRI();
652  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
653  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
654  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
655  MRI->setRegBank(LoLHS, *Bank);
656  MRI->setRegBank(HiLHS, *Bank);
657 
658  Regs.push_back(LoLHS);
659  Regs.push_back(HiLHS);
660 
661  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
662  .addDef(LoLHS)
663  .addDef(HiLHS)
664  .addUse(Reg);
665 }
666 
667 /// Replace the current type each register in \p Regs has with \p NewTy
668 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
669  LLT NewTy) {
670  for (Register Reg : Regs) {
671  assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
672  MRI.setType(Reg, NewTy);
673  }
674 }
675 
676 static LLT getHalfSizedType(LLT Ty) {
677  if (Ty.isVector()) {
678  assert(Ty.getElementCount().isKnownMultipleOf(2));
679  return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
680  Ty.getElementType());
681  }
682 
683  assert(Ty.getScalarSizeInBits() % 2 == 0);
684  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
685 }
686 
687 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
688 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
689 /// execute the instruction for each unique combination of values in all lanes
690 /// in the wave. The block will be split such that rest of the instructions are
691 /// moved to a new block.
692 ///
693 /// Essentially performs this loop:
694 //
695 /// Save Execution Mask
696 /// For (Lane : Wavefront) {
697 /// Enable Lane, Disable all other lanes
698 /// SGPR = read SGPR value for current lane from VGPR
699 /// VGPRResult[Lane] = use_op SGPR
700 /// }
701 /// Restore Execution Mask
702 ///
703 /// There is additional complexity to try for compare values to identify the
704 /// unique values used.
705 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
706  MachineIRBuilder &B,
707  iterator_range<MachineBasicBlock::iterator> Range,
708  SmallSet<Register, 4> &SGPROperandRegs,
709  MachineRegisterInfo &MRI) const {
710  SmallVector<Register, 4> ResultRegs;
711  SmallVector<Register, 4> InitResultRegs;
712  SmallVector<Register, 4> PhiRegs;
713 
714  // Track use registers which have already been expanded with a readfirstlane
715  // sequence. This may have multiple uses if moving a sequence.
716  DenseMap<Register, Register> WaterfalledRegMap;
717 
718  MachineBasicBlock &MBB = B.getMBB();
719  MachineFunction *MF = &B.getMF();
720 
721  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
722  const unsigned WaveAndOpc = Subtarget.isWave32() ?
723  AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
724  const unsigned MovTermOpc = Subtarget.isWave32() ?
725  AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
726  const unsigned XorTermOpc = Subtarget.isWave32() ?
727  AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
728  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
729  AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
730  const unsigned ExecReg = Subtarget.isWave32() ?
731  AMDGPU::EXEC_LO : AMDGPU::EXEC;
732 
733 #ifndef NDEBUG
734  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
735 #endif
736 
737  for (MachineInstr &MI : Range) {
738  for (MachineOperand &Def : MI.defs()) {
739  if (MRI.use_nodbg_empty(Def.getReg()))
740  continue;
741 
742  LLT ResTy = MRI.getType(Def.getReg());
743  const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
744  ResultRegs.push_back(Def.getReg());
745  Register InitReg = B.buildUndef(ResTy).getReg(0);
746  Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
747  InitResultRegs.push_back(InitReg);
748  PhiRegs.push_back(PhiReg);
749  MRI.setRegBank(PhiReg, *DefBank);
750  MRI.setRegBank(InitReg, *DefBank);
751  }
752  }
753 
754  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
755  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
756 
757  // Don't bother using generic instructions/registers for the exec mask.
758  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
759  .addDef(InitSaveExecReg);
760 
761  Register PhiExec = MRI.createVirtualRegister(WaveRC);
762  Register NewExec = MRI.createVirtualRegister(WaveRC);
763 
764  // To insert the loop we need to split the block. Move everything before this
765  // point to a new block, and insert a new empty block before this instruction.
766  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
767  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
768  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
769  MachineFunction::iterator MBBI(MBB);
770  ++MBBI;
771  MF->insert(MBBI, LoopBB);
772  MF->insert(MBBI, RestoreExecBB);
773  MF->insert(MBBI, RemainderBB);
774 
775  LoopBB->addSuccessor(RestoreExecBB);
776  LoopBB->addSuccessor(LoopBB);
777 
778  // Move the rest of the block into a new block.
779  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
780  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
781 
782  MBB.addSuccessor(LoopBB);
783  RestoreExecBB->addSuccessor(RemainderBB);
784 
785  B.setInsertPt(*LoopBB, LoopBB->end());
786 
787  B.buildInstr(TargetOpcode::PHI)
788  .addDef(PhiExec)
789  .addReg(InitSaveExecReg)
790  .addMBB(&MBB)
791  .addReg(NewExec)
792  .addMBB(LoopBB);
793 
794  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
795  B.buildInstr(TargetOpcode::G_PHI)
796  .addDef(std::get<2>(Result))
797  .addReg(std::get<0>(Result)) // Initial value / implicit_def
798  .addMBB(&MBB)
799  .addReg(std::get<1>(Result)) // Mid-loop value.
800  .addMBB(LoopBB);
801  }
802 
803  const DebugLoc &DL = B.getDL();
804 
805  MachineInstr &FirstInst = *Range.begin();
806 
807  // Move the instruction into the loop. Note we moved everything after
808  // Range.end() already into a new block, so Range.end() is no longer valid.
809  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
810 
811  // Figure out the iterator range after splicing the instructions.
812  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
813  auto NewEnd = LoopBB->end();
814 
815  MachineBasicBlock::iterator I = Range.begin();
816  B.setInsertPt(*LoopBB, I);
817 
818  Register CondReg;
819 
820  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
821 
822  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
823  for (MachineOperand &Op : MI.uses()) {
824  if (!Op.isReg() || Op.isDef())
825  continue;
826 
827  Register OldReg = Op.getReg();
828  if (!SGPROperandRegs.count(OldReg))
829  continue;
830 
831  // See if we already processed this register in another instruction in the
832  // sequence.
833  auto OldVal = WaterfalledRegMap.find(OldReg);
834  if (OldVal != WaterfalledRegMap.end()) {
835  Op.setReg(OldVal->second);
836  continue;
837  }
838 
839  Register OpReg = Op.getReg();
840  LLT OpTy = MRI.getType(OpReg);
841 
842  const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
843  if (OpBank != &AMDGPU::VGPRRegBank) {
844  // Insert copy from AGPR to VGPR before the loop.
845  B.setMBB(MBB);
846  OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
847  MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
848  B.setInstr(*I);
849  }
850 
851  unsigned OpSize = OpTy.getSizeInBits();
852 
853  // Can only do a readlane of 32-bit pieces.
854  if (OpSize == 32) {
855  // Avoid extra copies in the simple case of one 32-bit register.
856  Register CurrentLaneOpReg
857  = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
858  MRI.setType(CurrentLaneOpReg, OpTy);
859 
860  constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
861  // Read the next variant <- also loop target.
862  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
863  CurrentLaneOpReg)
864  .addReg(OpReg);
865 
866  Register NewCondReg = MRI.createVirtualRegister(WaveRC);
867  bool First = CondReg == AMDGPU::NoRegister;
868  if (First)
869  CondReg = NewCondReg;
870 
871  // Compare the just read M0 value to all possible Idx values.
872  B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
873  .addDef(NewCondReg)
874  .addReg(CurrentLaneOpReg)
875  .addReg(OpReg);
876  Op.setReg(CurrentLaneOpReg);
877 
878  if (!First) {
879  Register AndReg = MRI.createVirtualRegister(WaveRC);
880 
881  // If there are multiple operands to consider, AND the conditions together.
882  B.buildInstr(WaveAndOpc)
883  .addDef(AndReg)
884  .addReg(NewCondReg)
885  .addReg(CondReg);
886  CondReg = AndReg;
887  }
888  } else {
889  LLT S32 = LLT::scalar(32);
890  SmallVector<Register, 8> ReadlanePieces;
891 
892  // The compares can be done as 64-bit, but the extract needs to be done
893  // in 32-bit pieces.
894 
895  bool Is64 = OpSize % 64 == 0;
896 
897  LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
898  unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
899  : AMDGPU::V_CMP_EQ_U32_e64;
900 
904  // Insert the unmerge before the loop.
905 
906  B.setMBB(MBB);
907  auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
908  B.setInstr(*I);
909 
910  unsigned NumPieces = Unmerge->getNumOperands() - 1;
911  for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
912  Register UnmergePiece = Unmerge.getReg(PieceIdx);
913 
914  Register CurrentLaneOpReg;
915  if (Is64) {
916  Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
917  Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
918 
919  MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
920  MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
921  MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
922 
923  // Read the next variant <- also loop target.
924  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
925  CurrentLaneOpRegLo)
926  .addReg(UnmergePiece, 0, AMDGPU::sub0);
927 
928  // Read the next variant <- also loop target.
929  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
930  CurrentLaneOpRegHi)
931  .addReg(UnmergePiece, 0, AMDGPU::sub1);
932 
933  CurrentLaneOpReg =
934  B.buildMerge(LLT::scalar(64),
935  {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
936  .getReg(0);
937 
938  MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
939 
940  if (OpTy.getScalarSizeInBits() == 64) {
941  // If we need to produce a 64-bit element vector, use the merged
942  // pieces.
943  ReadlanePieces.push_back(CurrentLaneOpReg);
944  } else {
945  // 32-bit element type.
946  ReadlanePieces.push_back(CurrentLaneOpRegLo);
947  ReadlanePieces.push_back(CurrentLaneOpRegHi);
948  }
949  } else {
950  CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
951  MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
952  MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
953 
954  // Read the next variant <- also loop target.
955  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
956  CurrentLaneOpReg)
957  .addReg(UnmergePiece);
958  ReadlanePieces.push_back(CurrentLaneOpReg);
959  }
960 
961  Register NewCondReg = MRI.createVirtualRegister(WaveRC);
962  bool First = CondReg == AMDGPU::NoRegister;
963  if (First)
964  CondReg = NewCondReg;
965 
966  B.buildInstr(CmpOp)
967  .addDef(NewCondReg)
968  .addReg(CurrentLaneOpReg)
969  .addReg(UnmergePiece);
970 
971  if (!First) {
972  Register AndReg = MRI.createVirtualRegister(WaveRC);
973 
974  // If there are multiple operands to consider, AND the conditions together.
975  B.buildInstr(WaveAndOpc)
976  .addDef(AndReg)
977  .addReg(NewCondReg)
978  .addReg(CondReg);
979  CondReg = AndReg;
980  }
981  }
982 
983  // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
984  // BUILD_VECTOR
985  if (OpTy.isVector()) {
986  auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
987  Op.setReg(Merge.getReg(0));
988  } else {
989  auto Merge = B.buildMerge(OpTy, ReadlanePieces);
990  Op.setReg(Merge.getReg(0));
991  }
992 
993  MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
994  }
995 
996  // Make sure we don't re-process this register again.
997  WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
998  }
999  }
1000 
1001  B.setInsertPt(*LoopBB, LoopBB->end());
1002 
1003  // Update EXEC, save the original EXEC value to VCC.
1004  B.buildInstr(AndSaveExecOpc)
1005  .addDef(NewExec)
1006  .addReg(CondReg, RegState::Kill);
1007 
1008  MRI.setSimpleHint(NewExec, CondReg);
1009 
1010  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1011  B.buildInstr(XorTermOpc)
1012  .addDef(ExecReg)
1013  .addReg(ExecReg)
1014  .addReg(NewExec);
1015 
1016  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1017  // s_cbranch_scc0?
1018 
1019  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1020  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1021  .addMBB(LoopBB);
1022 
1023  // Save the EXEC mask before the loop.
1024  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1025  .addReg(ExecReg);
1026 
1027  // Restore the EXEC mask after the loop.
1028  B.setMBB(*RestoreExecBB);
1029  B.buildInstr(MovTermOpc)
1030  .addDef(ExecReg)
1031  .addReg(SaveExecReg);
1032 
1033  // Set the insert point after the original instruction, so any new
1034  // instructions will be in the remainder.
1035  B.setInsertPt(*RemainderBB, RemainderBB->begin());
1036 
1037  return true;
1038 }
1039 
1040 // Return any unique registers used by \p MI at \p OpIndices that need to be
1041 // handled in a waterfall loop. Returns these registers in \p
1042 // SGPROperandRegs. Returns true if there are any operands to handle and a
1043 // waterfall loop is necessary.
1044 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1045  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1046  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1047  for (unsigned Op : OpIndices) {
1048  assert(MI.getOperand(Op).isUse());
1049  Register Reg = MI.getOperand(Op).getReg();
1050  const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1051  if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1052  SGPROperandRegs.insert(Reg);
1053  }
1054 
1055  // No operands need to be replaced, so no need to loop.
1056  return !SGPROperandRegs.empty();
1057 }
1058 
1059 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1060  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1061  ArrayRef<unsigned> OpIndices) const {
1062  // Use a set to avoid extra readfirstlanes in the case where multiple operands
1063  // are the same register.
1064  SmallSet<Register, 4> SGPROperandRegs;
1065 
1066  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1067  return false;
1068 
1069  MachineBasicBlock::iterator I = MI.getIterator();
1070  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1071  SGPROperandRegs, MRI);
1072 }
1073 
1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1075  MachineInstr &MI, MachineRegisterInfo &MRI,
1076  ArrayRef<unsigned> OpIndices) const {
1077  MachineIRBuilder B(MI);
1078  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1079 }
1080 
1081 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1082 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1083  MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1084  Register Reg = MI.getOperand(OpIdx).getReg();
1085  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1086  if (Bank == &AMDGPU::SGPRRegBank)
1087  return;
1088 
1089  LLT Ty = MRI.getType(Reg);
1090  MachineIRBuilder B(MI);
1091 
1092  if (Bank != &AMDGPU::VGPRRegBank) {
1093  // We need to copy from AGPR to VGPR
1094  Reg = B.buildCopy(Ty, Reg).getReg(0);
1095  MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1096  }
1097 
1098  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1099  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1100  .addDef(SGPR)
1101  .addReg(Reg);
1102 
1103  MRI.setType(SGPR, Ty);
1104 
1105  const TargetRegisterClass *Constrained =
1106  constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1107  (void)Constrained;
1108  assert(Constrained && "Failed to constrain readfirstlane src reg");
1109 
1110  MI.getOperand(OpIdx).setReg(SGPR);
1111 }
1112 
1113 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1114 /// rest will be in the remainder.
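/// e.g. (illustrative) splitUnequalType(s96, 64) gives {s64, s32}, and
/// splitUnequalType(<3 x s32>, 64) gives {<2 x s32>, s32}.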
1115 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1116  unsigned TotalSize = Ty.getSizeInBits();
1117  if (!Ty.isVector())
1118  return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1119 
1120  LLT EltTy = Ty.getElementType();
1121  unsigned EltSize = EltTy.getSizeInBits();
1122  assert(FirstSize % EltSize == 0);
1123 
1124  unsigned FirstPartNumElts = FirstSize / EltSize;
1125  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1126 
1127  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1128  LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1129 }
1130 
1131 static LLT widen96To128(LLT Ty) {
1132  if (!Ty.isVector())
1133  return LLT::scalar(128);
1134 
1135  LLT EltTy = Ty.getElementType();
1136  assert(128 % EltTy.getSizeInBits() == 0);
1137  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1138 }
1139 
1140 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1141  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1142  MachineRegisterInfo &MRI) const {
1143  Register DstReg = MI.getOperand(0).getReg();
1144  const LLT LoadTy = MRI.getType(DstReg);
1145  unsigned LoadSize = LoadTy.getSizeInBits();
1146  const unsigned MaxNonSmrdLoadSize = 128;
1147 
1148  const RegisterBank *DstBank =
1149  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1150  if (DstBank == &AMDGPU::SGPRRegBank) {
1151  // There are some special cases that we need to look at for 32 bit and 96
1152  // bit SGPR loads otherwise we have nothing to do.
1153  if (LoadSize != 32 && LoadSize != 96)
1154  return false;
1155 
1156  MachineMemOperand *MMO = *MI.memoperands_begin();
1157  const unsigned MemSize = 8 * MMO->getSize();
1158  // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1159  // 32 bit. Check to see if we need to widen the memory access; 8 or 16 bit
1160  // scalar loads should have a load size of 32 but memory access size of less
1161  // than 32.
1162  if (LoadSize == 32 &&
1163  (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1164  return false;
1165 
1166  Register PtrReg = MI.getOperand(1).getReg();
1167 
1168  ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1169  MachineIRBuilder B(MI, O);
1170 
1171  if (LoadSize == 32) {
1172  // This is an extending load from a sub-dword size. Widen the memory
1173  // access size to 4 bytes and clear the extra high bits appropriately
1174  const LLT S32 = LLT::scalar(32);
1175  if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1176  // Must extend the sign bit into higher bits for a G_SEXTLOAD
1177  auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1178  B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1179  } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1180  // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1181  auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1182  B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1183  } else
1184  // We do not need to touch the higher bits for regular loads.
1185  B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1186  } else {
1187  // 96-bit loads are only available for vector loads. We need to split this
1188  // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1189  if (MMO->getAlign() < Align(16)) {
1190  LLT Part64, Part32;
1191  std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1192  auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1193  auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1194 
1195  auto Undef = B.buildUndef(LoadTy);
1196  auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1197  B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1198  } else {
1199  LLT WiderTy = widen96To128(LoadTy);
1200  auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1201  B.buildExtract(MI.getOperand(0), WideLoad, 0);
1202  }
1203  }
1204 
1205  MI.eraseFromParent();
1206  return true;
1207  }
1208 
1209  // 128-bit loads are supported for all instruction types.
1210  if (LoadSize <= MaxNonSmrdLoadSize)
1211  return false;
1212 
1213  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1214  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1215 
1216  if (SrcRegs.empty())
1217  SrcRegs.push_back(MI.getOperand(1).getReg());
1218 
1219  assert(LoadSize % MaxNonSmrdLoadSize == 0);
1220 
1221  // RegBankSelect only emits scalar types, so we need to reset the pointer
1222  // operand to a pointer type.
1223  Register BasePtrReg = SrcRegs[0];
1224  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1225  MRI.setType(BasePtrReg, PtrTy);
1226 
1227  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1228  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1229  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1230  MachineIRBuilder B(MI, Observer);
1231  LegalizerHelper Helper(B.getMF(), Observer, B);
1232 
1233  if (LoadTy.isVector()) {
1234  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1235  return false;
1236  } else {
1237  if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1238  return false;
1239  }
1240 
1241  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1242  return true;
1243 }
1244 
1245 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1246  MachineInstr &MI,
1247  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1248  MachineRegisterInfo &MRI) const {
1249  const MachineFunction &MF = *MI.getMF();
1250  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1251  const auto &TFI = *ST.getFrameLowering();
1252 
1253  // Guard in case the stack growth direction ever changes with scratch
1254  // instructions.
1255  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1256  return false;
1257 
1258  Register Dst = MI.getOperand(0).getReg();
1259  Register AllocSize = MI.getOperand(1).getReg();
1260  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1261 
1262  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1263 
1264  // TODO: Need to emit a wave reduction to get the maximum size.
1265  if (SizeBank != &AMDGPU::SGPRRegBank)
1266  return false;
1267 
1268  LLT PtrTy = MRI.getType(Dst);
1269  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1270 
1271  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1272  Register SPReg = Info->getStackPtrOffsetReg();
1273  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1274  MachineIRBuilder B(MI, ApplyBank);
1275 
1276  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1277  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1278 
1279  auto SPCopy = B.buildCopy(PtrTy, SPReg);
1280  if (Alignment > TFI.getStackAlign()) {
1281  auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1282  B.buildMaskLowPtrBits(Dst, PtrAdd,
1283  Log2(Alignment) + ST.getWavefrontSizeLog2());
1284  } else {
1285  B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1286  }
1287 
1288  MI.eraseFromParent();
1289  return true;
1290 }
1291 
1292 bool AMDGPURegisterBankInfo::applyMappingImage(
1293  MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1294  MachineRegisterInfo &MRI, int RsrcIdx) const {
1295  const int NumDefs = MI.getNumExplicitDefs();
1296 
1297  // The reported argument index is relative to the IR intrinsic call arguments,
1298  // so we need to shift by the number of defs and the intrinsic ID.
1299  RsrcIdx += NumDefs + 1;
1300 
1301  // Insert copies to VGPR arguments.
1302  applyDefaultMapping(OpdMapper);
1303 
1304  // Fixup any SGPR arguments.
1305  SmallVector<unsigned, 4> SGPRIndexes;
1306  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1307  if (!MI.getOperand(I).isReg())
1308  continue;
1309 
1310  // If this intrinsic has a sampler, it immediately follows rsrc.
1311  if (I == RsrcIdx || I == RsrcIdx + 1)
1312  SGPRIndexes.push_back(I);
1313  }
1314 
1315  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1316  return true;
1317 }
1318 
1319 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1320  Register Reg) {
1321  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1322  if (!Def)
1323  return Reg;
1324 
1325  // TODO: Guard against this being an implicit def
1326  return Def->getOperand(0).getReg();
1327 }
1328 
1329 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1330 // the three offsets (voffset, soffset and instoffset)
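// For example (illustrative): a uniform constant combined offset is split by
// AMDGPU::splitMUBUFOffset into a materialized soffset constant plus the
// instruction immediate, with voffset = 0; a (VGPR + SGPR) G_ADD is split so
// the VGPR addend feeds voffset and the SGPR addend feeds soffset.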
1331 static unsigned setBufferOffsets(MachineIRBuilder &B,
1332  const AMDGPURegisterBankInfo &RBI,
1333  Register CombinedOffset, Register &VOffsetReg,
1334  Register &SOffsetReg, int64_t &InstOffsetVal,
1335  Align Alignment) {
1336  const LLT S32 = LLT::scalar(32);
1337  MachineRegisterInfo *MRI = B.getMRI();
1338 
1339  if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1340  uint32_t SOffset, ImmOffset;
1341  if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1342  Alignment)) {
1343  VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1344  SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1345  InstOffsetVal = ImmOffset;
1346 
1347  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1348  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1349  return SOffset + ImmOffset;
1350  }
1351  }
1352 
1353  Register Base;
1354  unsigned Offset;
1355 
1356  std::tie(Base, Offset) =
1357  AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1358 
1359  uint32_t SOffset, ImmOffset;
1360  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1361  &RBI.Subtarget, Alignment)) {
1362  if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1363  VOffsetReg = Base;
1364  SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1365  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1366  InstOffsetVal = ImmOffset;
1367  return 0; // XXX - Why is this 0?
1368  }
1369 
1370  // If we have SGPR base, we can use it for soffset.
1371  if (SOffset == 0) {
1372  VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1373  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1374  SOffsetReg = Base;
1375  InstOffsetVal = ImmOffset;
1376  return 0; // XXX - Why is this 0?
1377  }
1378  }
1379 
1380  // Handle the variable sgpr + vgpr case.
1381  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1382  if (Add && (int)Offset >= 0) {
1383  Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1384  Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1385 
1386  const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1387  const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1388 
1389  if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1390  VOffsetReg = Src0;
1391  SOffsetReg = Src1;
1392  return 0;
1393  }
1394 
1395  if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1396  VOffsetReg = Src1;
1397  SOffsetReg = Src0;
1398  return 0;
1399  }
1400  }
1401 
1402  // Ensure we have a VGPR for the combined offset. This could be an issue if we
1403  // have an SGPR offset and a VGPR resource.
1404  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1405  VOffsetReg = CombinedOffset;
1406  } else {
1407  VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1408  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1409  }
1410 
1411  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1412  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1413  return 0;
1414 }
1415 
1416 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1417  const OperandsMapper &OpdMapper) const {
1418  MachineInstr &MI = OpdMapper.getMI();
1419  MachineRegisterInfo &MRI = OpdMapper.getMRI();
1420 
1421  const LLT S32 = LLT::scalar(32);
1422  Register Dst = MI.getOperand(0).getReg();
1423  LLT Ty = MRI.getType(Dst);
1424 
1425  const RegisterBank *RSrcBank =
1426  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1427  const RegisterBank *OffsetBank =
1428  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1429  if (RSrcBank == &AMDGPU::SGPRRegBank &&
1430  OffsetBank == &AMDGPU::SGPRRegBank)
1431  return true; // Legal mapping
1432 
1433  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1434  // here but don't have an MMO.
1435 
1436  unsigned LoadSize = Ty.getSizeInBits();
1437  int NumLoads = 1;
1438  if (LoadSize == 256 || LoadSize == 512) {
1439  NumLoads = LoadSize / 128;
1440  Ty = Ty.divide(NumLoads);
1441  }
1442 
1443  // Use the alignment to ensure that the required offsets will fit into the
1444  // immediate offsets.
1445  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1446 
1447  MachineIRBuilder B(MI);
1448  MachineFunction &MF = B.getMF();
1449 
1450  Register SOffset;
1451  Register VOffset;
1452  int64_t ImmOffset = 0;
1453 
1454  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1455  VOffset, SOffset, ImmOffset, Alignment);
1456 
1457  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1458  // can, but we need to track an MMO for that.
1459  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1460  const Align MemAlign(4); // FIXME: ABI type alignment?
1461  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1462  MachinePointerInfo(),
1463  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1464  MachineMemOperand::MOInvariant,
1465  MemSize, MemAlign);
1466  if (MMOOffset != 0)
1467  BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1468 
1469  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1470  // assume that the buffer is unswizzled.
1471 
1472  Register RSrc = MI.getOperand(1).getReg();
1473  Register VIndex = B.buildConstant(S32, 0).getReg(0);
1474  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1475 
1476  SmallVector<Register, 4> LoadParts(NumLoads);
1477 
1478  MachineBasicBlock::iterator MII = MI.getIterator();
1479  MachineInstrSpan Span(MII, &B.getMBB());
1480 
1481  for (int i = 0; i < NumLoads; ++i) {
1482  if (NumLoads == 1) {
1483  LoadParts[i] = Dst;
1484  } else {
1485  LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1486  MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1487  }
1488 
1489  MachineMemOperand *MMO = BaseMMO;
1490  if (i != 0)
1491  BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1492 
1493  B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1494  .addDef(LoadParts[i]) // vdata
1495  .addUse(RSrc) // rsrc
1496  .addUse(VIndex) // vindex
1497  .addUse(VOffset) // voffset
1498  .addUse(SOffset) // soffset
1499  .addImm(ImmOffset + 16 * i) // offset(imm)
1500  .addImm(0) // cachepolicy, swizzled buffer(imm)
1501  .addImm(0) // idxen(imm)
1502  .addMemOperand(MMO);
1503  }
1504 
1505  // TODO: If only the resource is a VGPR, it may be better to execute the
1506  // scalar load in the waterfall loop if the resource is expected to frequently
1507  // be dynamically uniform.
1508  if (RSrcBank != &AMDGPU::SGPRRegBank) {
1509  // Remove the original instruction to avoid potentially confusing the
1510  // waterfall loop logic.
1511  B.setInstr(*Span.begin());
1512  MI.eraseFromParent();
1513 
1514  SmallSet<Register, 4> OpsToWaterfall;
1515 
1516  OpsToWaterfall.insert(RSrc);
1517  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1518  OpsToWaterfall, MRI);
1519  }
1520 
1521  if (NumLoads != 1) {
1522  if (Ty.isVector())
1523  B.buildConcatVectors(Dst, LoadParts);
1524  else
1525  B.buildMerge(Dst, LoadParts);
1526  }
1527 
1528  // We removed the instruction earlier with a waterfall loop.
1529  if (RSrcBank == &AMDGPU::SGPRRegBank)
1530  MI.eraseFromParent();
1531 
1532  return true;
1533 }
1534 
1535 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1536  bool Signed) const {
1537  MachineInstr &MI = OpdMapper.getMI();
1538  MachineRegisterInfo &MRI = OpdMapper.getMRI();
1539 
1540  // Insert basic copies
1541  applyDefaultMapping(OpdMapper);
1542 
1543  Register DstReg = MI.getOperand(0).getReg();
1544  LLT Ty = MRI.getType(DstReg);
1545 
1546  const LLT S32 = LLT::scalar(32);
1547 
1548  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1549  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1550  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1551  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1552 
1553  const RegisterBank *DstBank =
1554  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1555  if (DstBank == &AMDGPU::VGPRRegBank) {
1556  if (Ty == S32)
1557  return true;
1558 
1559  // There is no 64-bit vgpr bitfield extract instruction, so the operation
1560  // is expanded to a sequence of instructions that implement the operation.
1561  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1562  MachineIRBuilder B(MI, ApplyBank);
1563 
1564  const LLT S64 = LLT::scalar(64);
1565  // Shift the source operand so that extracted bits start at bit 0.
1566  auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1567  : B.buildLShr(S64, SrcReg, OffsetReg);
1568  auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1569 
1570  // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1571  // if the width is a constant.
1572  if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1573  // Use the 32-bit bitfield extract instructions when the width is a constant.
1574  // Depending on the width, use either the low or the high 32 bits.
1575  auto Zero = B.buildConstant(S32, 0);
1576  auto WidthImm = ConstWidth->Value.getZExtValue();
1577  if (WidthImm <= 32) {
1578  // Use bitfield extract on the lower 32-bit source, and then sign-extend
1579  // or clear the upper 32-bits.
1580  auto Extract =
1581  Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1582  : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1583  auto Extend =
1584  Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1585  B.buildMerge(DstReg, {Extract, Extend});
1586  } else {
1587  // Use bitfield extract on upper 32-bit source, and combine with lower
1588  // 32-bit source.
1589  auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1590  auto Extract =
1591  Signed
1592  ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1593  : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1594  B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1595  }
1596  MI.eraseFromParent();
1597  return true;
1598  }
1599 
1600  // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1601  // operations.
1602  auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1603  auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1604  if (Signed)
1605  B.buildAShr(S64, SignBit, ExtShift);
1606  else
1607  B.buildLShr(S64, SignBit, ExtShift);
1608  MI.eraseFromParent();
1609  return true;
1610  }
1611 
1612  // The scalar form packs the offset and width in a single operand.
1613 
1614  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1615  MachineIRBuilder B(MI, ApplyBank);
1616 
1617  // Ensure the high bits are clear to insert the offset.
1618  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1619  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1620 
1621  // The shift zeros out the low bits, so don't bother clamping the width value.
1622  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1623 
1624  // Pack the offset and width of the BFE into the format expected by
1625  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
1626  // contain the offset and bits [22:16] the width.
1627  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1628 
1629  // TODO: It might be worth using a pseudo here to avoid scc clobber and
1630  // register class constraints.
1631  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1632  (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1633 
1634  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1635  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1636  llvm_unreachable("failed to constrain BFE");
1637 
1638  MI.eraseFromParent();
1639  return true;
1640 }
1641 
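// --- Illustrative sketch (not part of this file): the scalar arithmetic behind
// the two BFE expansions above, written with plain integers. Assumes
// 1 <= Width and Offset + Width <= 64.
#include <cstdint>

// S_BFE_* packed operand: offset in bits [5:0], width in bits [22:16].
static uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | (Width << 16);
}

// Generic 64-bit expansion: shift the field down to bit 0, then use a
// left/right shift pair to sign-fill (signed) or zero-fill (unsigned) the
// upper bits, i.e. Src >> Offset << (64 - Width) >> (64 - Width).
static int64_t sbfx64(int64_t Src, unsigned Offset, unsigned Width) {
  uint64_t Shifted = (uint64_t)Src >> Offset; // field now starts at bit 0
  unsigned Ext = 64 - Width;
  return (int64_t)(Shifted << Ext) >> Ext;    // arithmetic shift sign-fills
}

static uint64_t ubfx64(uint64_t Src, unsigned Offset, unsigned Width) {
  uint64_t Shifted = Src >> Offset;
  unsigned Ext = 64 - Width;
  return (Shifted << Ext) >> Ext;             // logical shift zero-fills
}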
1642 // Return a suitable opcode for extending the operands of Opc when widening.
1643 static unsigned getExtendOp(unsigned Opc) {
1644  switch (Opc) {
1645  case TargetOpcode::G_ASHR:
1646  case TargetOpcode::G_SMIN:
1647  case TargetOpcode::G_SMAX:
1648  return TargetOpcode::G_SEXT;
1649  case TargetOpcode::G_LSHR:
1650  case TargetOpcode::G_UMIN:
1651  case TargetOpcode::G_UMAX:
1652  return TargetOpcode::G_ZEXT;
1653  default:
1654  return TargetOpcode::G_ANYEXT;
1655  }
1656 }
1657 
1658 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1659 // any illegal vector extend or unmerge operations.
1660 static std::pair<Register, Register>
1661 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1662  const LLT S32 = LLT::scalar(32);
1663  auto Bitcast = B.buildBitcast(S32, Src);
1664 
1665  if (ExtOpcode == TargetOpcode::G_SEXT) {
1666  auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1667  auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1668  return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1669  }
1670 
1671  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1672  if (ExtOpcode == TargetOpcode::G_ZEXT) {
1673  auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1674  return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1675  }
1676 
1677  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1678  return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1679 }
1680 
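// --- Illustrative sketch (not part of this file): the same unpacking performed
// on a plain 32-bit value holding two packed 16-bit lanes (lo in bits [15:0],
// hi in bits [31:16]), for the signed and unsigned cases.
#include <cstdint>
#include <utility>

static std::pair<int32_t, int32_t> unpackV2S16(uint32_t Packed, bool Signed) {
  if (Signed) {
    int32_t Lo = (int32_t)(Packed << 16) >> 16; // sext_inreg from 16 bits
    int32_t Hi = (int32_t)Packed >> 16;         // ashr by 16
    return {Lo, Hi};
  }
  int32_t Lo = (int32_t)(Packed & 0xffff);      // zext of the low lane
  int32_t Hi = (int32_t)(Packed >> 16);         // lshr by 16
  return {Lo, Hi};
}
// (The anyext case simply reuses the bitcast value itself for the low lane.)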
1681 // For cases where only a single copy is inserted for matching register banks.
1682 // Replace the register in the instruction operand.
1683 static bool substituteSimpleCopyRegs(
1684  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1685  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1686  if (!SrcReg.empty()) {
1687  assert(SrcReg.size() == 1);
1688  OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1689  return true;
1690  }
1691 
1692  return false;
1693 }
1694 
1695 /// Handle register layout difference for f16 images for some subtargets.
1696 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1697  MachineRegisterInfo &MRI,
1698  Register Reg) const {
1699  if (!Subtarget.hasUnpackedD16VMem())
1700  return Reg;
1701 
1702  const LLT S16 = LLT::scalar(16);
1703  LLT StoreVT = MRI.getType(Reg);
1704  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1705  return Reg;
1706 
1707  auto Unmerge = B.buildUnmerge(S16, Reg);
1708 
1709 
1710  SmallVector<Register, 4> WideRegs;
1711  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1712  WideRegs.push_back(Unmerge.getReg(I));
1713 
1714  const LLT S32 = LLT::scalar(32);
1715  int NumElts = StoreVT.getNumElements();
1716 
1717  return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1718 }
1719 
1720 static std::pair<Register, unsigned>
1721 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1722  int64_t Const;
1723  if (mi_match(Reg, MRI, m_ICst(Const)))
1724  return std::make_pair(Register(), Const);
1725 
1726  Register Base;
1727  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1728  return std::make_pair(Base, Const);
1729 
1730  // TODO: Handle G_OR used for add case
1731  return std::make_pair(Reg, 0);
1732 }
1733 
1734 std::pair<Register, unsigned>
1735 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1736  Register OrigOffset) const {
1737  const unsigned MaxImm = 4095;
1738  Register BaseReg;
1739  unsigned ImmOffset;
1740  const LLT S32 = LLT::scalar(32);
1741 
1742  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1743  OrigOffset);
1744 
1745  unsigned C1 = 0;
1746  if (ImmOffset != 0) {
1747  // If the immediate value is too big for the immoffset field, keep only the
1748  // low bits (value & 4095) in the immoffset field so that the value that is
1749  // copied/added for the voffset field is a multiple of 4096, and it stands
1750  // more chance of being CSEd with the copy/add for another similar load/store.
1751  // However, do not do that rounding down to a multiple of 4096 if that is a
1752  // negative number, as it appears to be illegal to have a negative offset
1753  // in the vgpr, even if adding the immediate offset makes it positive.
1754  unsigned Overflow = ImmOffset & ~MaxImm;
1755  ImmOffset -= Overflow;
1756  if ((int32_t)Overflow < 0) {
1757  Overflow += ImmOffset;
1758  ImmOffset = 0;
1759  }
1760 
1761  C1 = ImmOffset;
1762  if (Overflow != 0) {
1763  if (!BaseReg)
1764  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1765  else {
1766  auto OverflowVal = B.buildConstant(S32, Overflow);
1767  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1768  }
1769  }
1770  }
1771 
1772  if (!BaseReg)
1773  BaseReg = B.buildConstant(S32, 0).getReg(0);
1774 
1775  return {BaseReg, C1};
1776 }
1777 
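// --- Illustrative sketch (not part of this file): the immediate/voffset split
// applied to the constant part of an offset, including the fix-up that avoids
// leaving a negative value for the voffset register.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitConstOffset(uint32_t Offset) {
  const uint32_t MaxImm = 4095;
  uint32_t Imm = Offset & MaxImm;        // candidate immoffset
  uint32_t Overflow = Offset & ~MaxImm;  // multiple of 4096 for the voffset
  if ((int32_t)Overflow < 0) {           // never leave a negative voffset
    Overflow += Imm;
    Imm = 0;
  }
  return {Overflow, Imm};                // {voffset constant, immoffset}
}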
1778 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1779  int64_t C;
1780  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1781 }
1782 
1783 static unsigned extractCPol(unsigned CachePolicy) {
1784  return CachePolicy & AMDGPU::CPol::ALL;
1785 }
1786 
1787 static unsigned extractSWZ(unsigned CachePolicy) {
1788  return (CachePolicy >> 3) & 1;
1789 }
1790 
1791 
1792 MachineInstr *
1793 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1794  MachineInstr &MI) const {
1795  MachineRegisterInfo &MRI = *B.getMRI();
1796  executeInWaterfallLoop(B, MI, MRI, {2, 4});
1797 
1798  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1799 
1800  Register VData = MI.getOperand(1).getReg();
1801  LLT Ty = MRI.getType(VData);
1802 
1803  int EltSize = Ty.getScalarSizeInBits();
1804  int Size = Ty.getSizeInBits();
1805 
1806  // FIXME: Broken integer truncstore.
1807  if (EltSize != 32)
1808  report_fatal_error("unhandled intrinsic store");
1809 
1810  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1811  const int MemSize = (*MI.memoperands_begin())->getSize();
1812 
1813 
1814  Register RSrc = MI.getOperand(2).getReg();
1815  Register VOffset = MI.getOperand(3).getReg();
1816  Register SOffset = MI.getOperand(4).getReg();
1817  unsigned CachePolicy = MI.getOperand(5).getImm();
1818 
1819  unsigned ImmOffset;
1820  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1821 
1822  const bool Offen = !isZero(VOffset, MRI);
1823 
1824  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1825  switch (8 * MemSize) {
1826  case 8:
1827  Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1828  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1829  break;
1830  case 16:
1831  Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1832  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1833  break;
1834  default:
1835  Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1836  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1837  if (Size > 32)
1838  Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1839  break;
1840  }
1841 
1842 
1843  // Set the insertion point back to the instruction in case it was moved into a
1844  // loop.
1845  B.setInstr(MI);
1846 
1847  MachineInstrBuilder MIB = B.buildInstr(Opc)
1848  .addUse(VData);
1849 
1850  if (Offen)
1851  MIB.addUse(VOffset);
1852 
1853  MIB.addUse(RSrc)
1854  .addUse(SOffset)
1855  .addImm(ImmOffset)
1856  .addImm(extractCPol(CachePolicy))
1857  .addImm(0) // tfe: FIXME: Remove from inst
1858  .addImm(extractSWZ(CachePolicy))
1859  .cloneMemRefs(MI);
1860 
1861  // FIXME: We need a way to report failure from applyMappingImpl.
1862  // Insert constrain copies before inserting the loop.
1863  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1864  report_fatal_error("failed to constrain selected store intrinsic");
1865 
1866  return MIB;
1867 }
1868 
1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1870  Register SrcReg) const {
1871  MachineRegisterInfo &MRI = *B.getMRI();
1872  LLT SrcTy = MRI.getType(SrcReg);
1873  if (SrcTy.getSizeInBits() == 32) {
1874  // Use a v_mov_b32 here to make the exec dependency explicit.
1875  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876  .addDef(DstReg)
1877  .addUse(SrcReg);
1878  return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1879  constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1880  }
1881 
1882  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1883  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1884 
1885  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1886  .addDef(TmpReg0)
1887  .addUse(SrcReg, 0, AMDGPU::sub0);
1888  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1889  .addDef(TmpReg1)
1890  .addUse(SrcReg, 0, AMDGPU::sub1);
1891  B.buildInstr(AMDGPU::REG_SEQUENCE)
1892  .addDef(DstReg)
1893  .addUse(TmpReg0)
1894  .addImm(AMDGPU::sub0)
1895  .addUse(TmpReg1)
1896  .addImm(AMDGPU::sub1);
1897 
1898  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1899  constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1900 }
1901 
1902 /// Utility function for pushing dynamic vector indexes with a constant offset
1903 /// into waterfall loops.
1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1905  MachineInstr &IdxUseInstr,
1906  unsigned OpIdx,
1907  unsigned ConstOffset) {
1908  MachineRegisterInfo &MRI = *B.getMRI();
1909  const LLT S32 = LLT::scalar(32);
1910  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1911  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1912 
1913  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1914 
1915  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1916  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1917  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1918  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1919 }
1920 
1921 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1922 /// original 32-bit source value (to be inserted in the low part of the combined
1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1924 /// value.
1925 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1926  Register Hi32Reg, Register Lo32Reg,
1927  unsigned ExtOpc,
1928  const RegisterBank &RegBank,
1929  bool IsBooleanSrc = false) {
1930  if (ExtOpc == AMDGPU::G_ZEXT) {
1931  B.buildConstant(Hi32Reg, 0);
1932  } else if (ExtOpc == AMDGPU::G_SEXT) {
1933  if (IsBooleanSrc) {
1934  // If we know the original source was an s1, the high half is the same as
1935  // the low.
1936  B.buildCopy(Hi32Reg, Lo32Reg);
1937  } else {
1938  // Replicate sign bit from 32-bit extended part.
1939  auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1940  B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1941  B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1942  }
1943  } else {
1944  assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1945  B.buildUndef(Hi32Reg);
1946  }
1947 }
1948 
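// --- Illustrative sketch (not part of this file): how the high 32 bits of the
// 64-bit result are derived from the already-extended low half.
#include <cstdint>

static uint32_t highHalfForExtend(int32_t Lo, bool IsSigned, bool IsBooleanSrc) {
  if (!IsSigned)
    return 0;                        // zext: high half is zero
  if (IsBooleanSrc)
    return (uint32_t)Lo;             // s1 source: low half is 0 or -1, high matches
  return Lo < 0 ? 0xffffffffu : 0u;  // sext: replicate the sign bit
  // (anyext leaves the high half undefined, so nothing needs computing)
}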
1949 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1950  MachineInstr &MI, MachineRegisterInfo &MRI,
1951  const OperandsMapper &OpdMapper) const {
1952 
1953  Register VecReg = MI.getOperand(1).getReg();
1954  Register Idx = MI.getOperand(2).getReg();
1955 
1956  const RegisterBank &IdxBank =
1957  *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1958 
1959  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1960 
1961  LLT VecTy = MRI.getType(VecReg);
1962  unsigned EltSize = VecTy.getScalarSizeInBits();
1963  unsigned NumElem = VecTy.getNumElements();
1964 
1965  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1966  IsDivergentIdx))
1967  return false;
1968 
1969  MachineIRBuilder B(MI);
1970  LLT S32 = LLT::scalar(32);
1971 
1972  const RegisterBank &DstBank =
1973  *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1974  const RegisterBank &SrcBank =
1975  *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1976 
1977  const RegisterBank &CCBank =
1978  (DstBank == AMDGPU::SGPRRegBank &&
1979  SrcBank == AMDGPU::SGPRRegBank &&
1980  IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1981  : AMDGPU::VCCRegBank;
1982  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1983 
1984  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1985  Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1986  MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1987  }
1988 
1989  LLT EltTy = VecTy.getScalarType();
1990  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1991  unsigned NumLanes = DstRegs.size();
1992  if (!NumLanes)
1993  NumLanes = 1;
1994  else
1995  EltTy = MRI.getType(DstRegs[0]);
1996 
1997  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1998  SmallVector<Register, 2> Res(NumLanes);
1999  for (unsigned L = 0; L < NumLanes; ++L)
2000  Res[L] = UnmergeToEltTy.getReg(L);
2001 
2002  for (unsigned I = 1; I < NumElem; ++I) {
2003  auto IC = B.buildConstant(S32, I);
2004  MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2005  auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2006  MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2007 
2008  for (unsigned L = 0; L < NumLanes; ++L) {
2009  auto S = B.buildSelect(EltTy, Cmp,
2010  UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
2011 
2012  for (unsigned N : { 0, 2, 3 })
2013  MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2014 
2015  Res[L] = S->getOperand(0).getReg();
2016  }
2017  }
2018 
2019  for (unsigned L = 0; L < NumLanes; ++L) {
2020  Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2021  B.buildCopy(DstReg, Res[L]);
2022  MRI.setRegBank(DstReg, DstBank);
2023  }
2024 
2025  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2026  MI.eraseFromParent();
2027 
2028  return true;
2029 }
2030 
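// --- Illustrative sketch (not part of this file): the compare/select chain the
// fold above produces, written for a 4-element array of 32-bit values.
#include <cstdint>

static uint32_t extractByCmpSelect(const uint32_t Vec[4], uint32_t Idx) {
  uint32_t Res = Vec[0];               // start from element 0
  for (uint32_t I = 1; I < 4; ++I)
    Res = (Idx == I) ? Vec[I] : Res;   // one compare + select per element
  return Res;
}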
2031 // Insert a cross regbank copy for a register if it already has a bank that
2032 // differs from the one we want to set.
2033 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2034  MachineIRBuilder &B, Register &Reg,
2035  const RegisterBank &Bank) {
2036  const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2037  if (CurrBank && *CurrBank != Bank) {
2038  Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2039  MRI.setRegBank(Copy, Bank);
2040  return Copy;
2041  }
2042 
2043  MRI.setRegBank(Reg, Bank);
2044  return Reg;
2045 }
2046 
2047 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2048  MachineInstr &MI, MachineRegisterInfo &MRI,
2049  const OperandsMapper &OpdMapper) const {
2050 
2051  Register VecReg = MI.getOperand(1).getReg();
2052  Register Idx = MI.getOperand(3).getReg();
2053 
2054  const RegisterBank &IdxBank =
2055  *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2056 
2057  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2058 
2059  LLT VecTy = MRI.getType(VecReg);
2060  unsigned EltSize = VecTy.getScalarSizeInBits();
2061  unsigned NumElem = VecTy.getNumElements();
2062 
2063  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2064  IsDivergentIdx))
2065  return false;
2066 
2067  MachineIRBuilder B(MI);
2068  LLT S32 = LLT::scalar(32);
2069 
2070  const RegisterBank &DstBank =
2071  *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2072  const RegisterBank &SrcBank =
2073  *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2074  const RegisterBank &InsBank =
2075  *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2076 
2077  const RegisterBank &CCBank =
2078  (DstBank == AMDGPU::SGPRRegBank &&
2079  SrcBank == AMDGPU::SGPRRegBank &&
2080  InsBank == AMDGPU::SGPRRegBank &&
2081  IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2082  : AMDGPU::VCCRegBank;
2083  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2084 
2085  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2086  Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2087  MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2088  }
2089 
2090  LLT EltTy = VecTy.getScalarType();
2091  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2092  unsigned NumLanes = InsRegs.size();
2093  if (!NumLanes) {
2094  NumLanes = 1;
2095  InsRegs.push_back(MI.getOperand(2).getReg());
2096  } else {
2097  EltTy = MRI.getType(InsRegs[0]);
2098  }
2099 
2100  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2101  SmallVector<Register, 16> Ops(NumElem * NumLanes);
2102 
2103  for (unsigned I = 0; I < NumElem; ++I) {
2104  auto IC = B.buildConstant(S32, I);
2105  MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2106  auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2107  MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2108 
2109  for (unsigned L = 0; L < NumLanes; ++L) {
2110  Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2111  Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2112  Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2113 
2114  Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2115  MRI.setRegBank(Select, DstBank);
2116 
2117  Ops[I * NumLanes + L] = Select;
2118  }
2119  }
2120 
2121  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2122  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2123  B.buildBuildVector(MI.getOperand(0), Ops);
2124  } else {
2125  auto Vec = B.buildBuildVector(MergeTy, Ops);
2126  MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2127  B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2128  }
2129 
2130  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2131  MI.eraseFromParent();
2132 
2133  return true;
2134 }
2135 
2136 void AMDGPURegisterBankInfo::applyMappingImpl(
2137  const OperandsMapper &OpdMapper) const {
2138  MachineInstr &MI = OpdMapper.getMI();
2139  unsigned Opc = MI.getOpcode();
2140  MachineRegisterInfo &MRI = OpdMapper.getMRI();
2141  switch (Opc) {
2142  case AMDGPU::G_PHI: {
2143  Register DstReg = MI.getOperand(0).getReg();
2144  LLT DstTy = MRI.getType(DstReg);
2145  if (DstTy != LLT::scalar(1))
2146  break;
2147 
2148  const LLT S32 = LLT::scalar(32);
2149  const RegisterBank *DstBank =
2150  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2151  if (DstBank == &AMDGPU::VCCRegBank) {
2152  applyDefaultMapping(OpdMapper);
2153  // The standard handling only considers the result register bank for
2154  // phis. For VCC, blindly inserting a copy when the phi is lowered will
2155  // produce an invalid copy. We can only copy with some kind of compare to
2156  get a vector boolean result. Insert a register bank copy that will be
2157  // correctly lowered to a compare.
2158  MachineIRBuilder B(*MI.getParent()->getParent());
2159 
2160  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2161  Register SrcReg = MI.getOperand(I).getReg();
2162  const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2163 
2164  if (SrcBank != &AMDGPU::VCCRegBank) {
2165  MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2166  B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2167 
2168  auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2169  MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2170  MI.getOperand(I).setReg(Copy.getReg(0));
2171  }
2172  }
2173 
2174  return;
2175  }
2176 
2177  // Phi handling is strange and only considers the bank of the destination.
2178  substituteSimpleCopyRegs(OpdMapper, 0);
2179 
2180  // Promote SGPR/VGPR booleans to s32
2181  MachineFunction *MF = MI.getParent()->getParent();
2182  ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2183  MachineIRBuilder B(MI, ApplyBank);
2184  LegalizerHelper Helper(*MF, ApplyBank, B);
2185 
2186  if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2187  llvm_unreachable("widen scalar should have succeeded");
2188 
2189  return;
2190  }
2191  case AMDGPU::G_ICMP:
2192  case AMDGPU::G_UADDO:
2193  case AMDGPU::G_USUBO:
2194  case AMDGPU::G_UADDE:
2195  case AMDGPU::G_SADDE:
2196  case AMDGPU::G_USUBE:
2197  case AMDGPU::G_SSUBE: {
2198  unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2199  Register DstReg = MI.getOperand(BoolDstOp).getReg();
2200 
2201  const RegisterBank *DstBank =
2202  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2203  if (DstBank != &AMDGPU::SGPRRegBank)
2204  break;
2205 
2206  const bool HasCarryIn = MI.getNumOperands() == 5;
2207 
2208  // If this is a scalar compare, promote the result to s32, as the selection
2209  // will end up using a copy to a 32-bit vreg.
2210  const LLT S32 = LLT::scalar(32);
2211  Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2212  MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2213  MI.getOperand(BoolDstOp).setReg(NewDstReg);
2214  MachineIRBuilder B(MI);
2215 
2216  if (HasCarryIn) {
2217  Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2218  MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2219  B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2220  MI.getOperand(4).setReg(NewSrcReg);
2221  }
2222 
2223  MachineBasicBlock *MBB = MI.getParent();
2224  B.setInsertPt(*MBB, std::next(MI.getIterator()));
2225 
2226  // If we had a constrained VCC result register, a copy was inserted to VCC
2227  // from SGPR.
2228  SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2229  if (DefRegs.empty())
2230  DefRegs.push_back(DstReg);
2231  B.buildTrunc(DefRegs[0], NewDstReg);
2232  return;
2233  }
2234  case AMDGPU::G_SELECT: {
2235  Register DstReg = MI.getOperand(0).getReg();
2236  LLT DstTy = MRI.getType(DstReg);
2237 
2238  SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2239  if (CondRegs.empty())
2240  CondRegs.push_back(MI.getOperand(1).getReg());
2241  else {
2242  assert(CondRegs.size() == 1);
2243  }
2244 
2245  const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2246  if (CondBank == &AMDGPU::SGPRRegBank) {
2247  MachineIRBuilder B(MI);
2248  const LLT S32 = LLT::scalar(32);
2249  Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2250  MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2251 
2252  MI.getOperand(1).setReg(NewCondReg);
2253  B.buildZExt(NewCondReg, CondRegs[0]);
2254  }
2255 
2256  if (DstTy.getSizeInBits() != 64)
2257  break;
2258 
2259  MachineIRBuilder B(MI);
2260  LLT HalfTy = getHalfSizedType(DstTy);
2261 
2262  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2263  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2264  SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2265 
2266  // All inputs are SGPRs, nothing special to do.
2267  if (DefRegs.empty()) {
2268  assert(Src1Regs.empty() && Src2Regs.empty());
2269  break;
2270  }
2271 
2272  if (Src1Regs.empty())
2273  split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2274  else {
2275  setRegsToType(MRI, Src1Regs, HalfTy);
2276  }
2277 
2278  if (Src2Regs.empty())
2279  split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2280  else
2281  setRegsToType(MRI, Src2Regs, HalfTy);
2282 
2283  setRegsToType(MRI, DefRegs, HalfTy);
2284 
2285  B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2286  B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2287 
2288  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2289  MI.eraseFromParent();
2290  return;
2291  }
2292  case AMDGPU::G_BRCOND: {
2293  Register CondReg = MI.getOperand(0).getReg();
2294  // FIXME: Should use legalizer helper, but should change bool ext type.
2295  const RegisterBank *CondBank =
2296  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2297 
2298  if (CondBank == &AMDGPU::SGPRRegBank) {
2299  MachineIRBuilder B(MI);
2300  const LLT S32 = LLT::scalar(32);
2301  Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2302  MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2303 
2304  MI.getOperand(0).setReg(NewCondReg);
2305  B.buildZExt(NewCondReg, CondReg);
2306  return;
2307  }
2308 
2309  break;
2310  }
2311  case AMDGPU::G_AND:
2312  case AMDGPU::G_OR:
2313  case AMDGPU::G_XOR: {
2314  // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2315  // there is a VGPR input.
2316  Register DstReg = MI.getOperand(0).getReg();
2317  LLT DstTy = MRI.getType(DstReg);
2318 
2319  if (DstTy.getSizeInBits() == 1) {
2320  const RegisterBank *DstBank =
2321  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2322  if (DstBank == &AMDGPU::VCCRegBank)
2323  break;
2324 
2325  MachineFunction *MF = MI.getParent()->getParent();
2326  ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2327  MachineIRBuilder B(MI, ApplyBank);
2328  LegalizerHelper Helper(*MF, ApplyBank, B);
2329 
2330  if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2331  LegalizerHelper::Legalized)
2332  llvm_unreachable("widen scalar should have succeeded");
2333  return;
2334  }
2335 
2336  if (DstTy.getSizeInBits() != 64)
2337  break;
2338 
2339  LLT HalfTy = getHalfSizedType(DstTy);
2340  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2341  SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2342  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2343 
2344  // All inputs are SGPRs, nothing special to do.
2345  if (DefRegs.empty()) {
2346  assert(Src0Regs.empty() && Src1Regs.empty());
2347  break;
2348  }
2349 
2350  assert(DefRegs.size() == 2);
2351  assert(Src0Regs.size() == Src1Regs.size() &&
2352  (Src0Regs.empty() || Src0Regs.size() == 2));
2353 
2354  // Depending on where the source registers came from, the generic code may
2355  // have decided to split the inputs already or not. If not, we still need to
2356  // extract the values.
2357  MachineIRBuilder B(MI);
2358 
2359  if (Src0Regs.empty())
2360  split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2361  else
2362  setRegsToType(MRI, Src0Regs, HalfTy);
2363 
2364  if (Src1Regs.empty())
2365  split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2366  else
2367  setRegsToType(MRI, Src1Regs, HalfTy);
2368 
2369  setRegsToType(MRI, DefRegs, HalfTy);
2370 
2371  B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2372  B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2373 
2374  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2375  MI.eraseFromParent();
2376  return;
2377  }
2378  case AMDGPU::G_ABS: {
2379  Register SrcReg = MI.getOperand(1).getReg();
2380  const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2381 
2382  // There is no VALU abs instruction so we need to replace it with a sub and
2383  // max combination.
2384  if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2385  MachineFunction *MF = MI.getParent()->getParent();
2386  ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2387  MachineIRBuilder B(MI, Apply);
2388  LegalizerHelper Helper(*MF, Apply, B);
2389 
2390  if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2391  llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2392  return;
2393  }
2394  LLVM_FALLTHROUGH;
2395  }
2396  case AMDGPU::G_ADD:
2397  case AMDGPU::G_SUB:
2398  case AMDGPU::G_MUL:
2399  case AMDGPU::G_SHL:
2400  case AMDGPU::G_LSHR:
2401  case AMDGPU::G_ASHR:
2402  case AMDGPU::G_SMIN:
2403  case AMDGPU::G_SMAX:
2404  case AMDGPU::G_UMIN:
2405  case AMDGPU::G_UMAX: {
2406  Register DstReg = MI.getOperand(0).getReg();
2407  LLT DstTy = MRI.getType(DstReg);
2408 
2409  // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2410  // Packed 16-bit operations need to be scalarized and promoted.
2411  if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2412  break;
2413 
2414  const RegisterBank *DstBank =
2415  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2416  if (DstBank == &AMDGPU::VGPRRegBank)
2417  break;
2418 
2419  const LLT S32 = LLT::scalar(32);
2420  MachineBasicBlock *MBB = MI.getParent();
2421  MachineFunction *MF = MBB->getParent();
2422  ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2423  MachineIRBuilder B(MI, ApplySALU);
2424 
2425  if (DstTy.isVector()) {
2426  Register WideSrc0Lo, WideSrc0Hi;
2427  Register WideSrc1Lo, WideSrc1Hi;
2428 
2429  unsigned ExtendOp = getExtendOp(MI.getOpcode());
2430  std::tie(WideSrc0Lo, WideSrc0Hi)
2431  = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2432  std::tie(WideSrc1Lo, WideSrc1Hi)
2433  = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2434  auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2435  auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2436  B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2437  MI.eraseFromParent();
2438  } else {
2439  LegalizerHelper Helper(*MF, ApplySALU, B);
2440 
2441  if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2442  llvm_unreachable("widen scalar should have succeeded");
2443 
2444  // FIXME: s16 shift amounts should be legal.
2445  if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2446  Opc == AMDGPU::G_ASHR) {
2447  B.setInsertPt(*MBB, MI.getIterator());
2448  if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2449  llvm_unreachable("widen scalar should have succeeded");
2450  }
2451  }
2452 
2453  return;
2454  }
2455  case AMDGPU::G_SEXT_INREG: {
2456  SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2457  if (SrcRegs.empty())
2458  break; // Nothing to repair
2459 
2460  const LLT S32 = LLT::scalar(32);
2461  MachineIRBuilder B(MI);
2462  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2463  GISelObserverWrapper Observer(&O);
2464  B.setChangeObserver(Observer);
2465 
2466  // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2467  // we would need to further expand, and doesn't let us directly set the
2468  // result registers.
2469  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2470 
2471  int Amt = MI.getOperand(2).getImm();
2472  if (Amt <= 32) {
2473  if (Amt == 32) {
2474  // The low bits are unchanged.
2475  B.buildCopy(DstRegs[0], SrcRegs[0]);
2476  } else {
2477  // Extend in the low bits and propagate the sign bit to the high half.
2478  B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2479  }
2480 
2481  B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2482  } else {
2483  // The low bits are unchanged, and extend in the high bits.
2484  B.buildCopy(DstRegs[0], SrcRegs[0]);
2485  B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2486  }
2487 
2488  Register DstReg = MI.getOperand(0).getReg();
2489  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490  MI.eraseFromParent();
2491  return;
2492  }
2493  case AMDGPU::G_CTPOP:
2494  case AMDGPU::G_BITREVERSE: {
2495  const RegisterBank *DstBank =
2496  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2497  if (DstBank == &AMDGPU::SGPRRegBank)
2498  break;
2499 
2500  Register SrcReg = MI.getOperand(1).getReg();
2501  const LLT S32 = LLT::scalar(32);
2502  LLT Ty = MRI.getType(SrcReg);
2503  if (Ty == S32)
2504  break;
2505 
2506  ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2507  MachineIRBuilder B(MI, ApplyVALU);
2508 
2509  MachineFunction &MF = B.getMF();
2510  LegalizerHelper Helper(MF, ApplyVALU, B);
2511 
2512  if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2513  llvm_unreachable("narrowScalar should have succeeded");
2514  return;
2515  }
2516  case AMDGPU::G_AMDGPU_FFBH_U32:
2517  case AMDGPU::G_AMDGPU_FFBL_B32:
2518  case AMDGPU::G_CTLZ_ZERO_UNDEF:
2519  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2520  const RegisterBank *DstBank =
2521  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2522  if (DstBank == &AMDGPU::SGPRRegBank)
2523  break;
2524 
2525  Register SrcReg = MI.getOperand(1).getReg();
2526  const LLT S32 = LLT::scalar(32);
2527  LLT Ty = MRI.getType(SrcReg);
2528  if (Ty == S32)
2529  break;
2530 
2531  // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2532  // which return -1 when the input is zero:
2533  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2534  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2535  // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2536  // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
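  // Worked example (illustrative, not from the original source): for
  // hi:lo = 0x00000000:0x00F00000, ffbh(hi) = 0xffffffff and ffbh(lo) = 8,
  // so umin(0xffffffff, 8 + 32) = 40, matching the 40 leading zeros of the
  // full 64-bit value. When hi is nonzero, ffbh(hi) < 32 always wins the umin,
  // so the result is again the 64-bit count.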
2537  ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2538  MachineIRBuilder B(MI, ApplyVALU);
2539  SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2540  unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2541  ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2542  : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2543  ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2544  : Opc;
2545  unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2546  auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2547  auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2548  unsigned AddOpc =
2549  Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2550  ? AMDGPU::G_ADD
2551  : AMDGPU::G_UADDSAT;
2552  Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2553  Register DstReg = MI.getOperand(0).getReg();
2554  B.buildUMin(DstReg, X, Y);
2555  MI.eraseFromParent();
2556  return;
2557  }
2558  case AMDGPU::G_SEXT:
2559  case AMDGPU::G_ZEXT:
2560  case AMDGPU::G_ANYEXT: {
2561  Register SrcReg = MI.getOperand(1).getReg();
2562  LLT SrcTy = MRI.getType(SrcReg);
2563  const bool Signed = Opc == AMDGPU::G_SEXT;
2564 
2565  assert(empty(OpdMapper.getVRegs(1)));
2566 
2567  MachineIRBuilder B(MI);
2568  const RegisterBank *SrcBank =
2569  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2570 
2571  Register DstReg = MI.getOperand(0).getReg();
2572  LLT DstTy = MRI.getType(DstReg);
2573  if (DstTy.isScalar() &&
2574  SrcBank != &AMDGPU::SGPRRegBank &&
2575  SrcBank != &AMDGPU::VCCRegBank &&
2576  // FIXME: Should handle any type that round to s64 when irregular
2577  // breakdowns supported.
2578  DstTy.getSizeInBits() == 64 &&
2579  SrcTy.getSizeInBits() <= 32) {
2580  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2581 
2582  // Extend to 32-bit, and then extend the low half.
2583  if (Signed) {
2584  // TODO: Should really be buildSExtOrCopy
2585  B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2586  } else if (Opc == AMDGPU::G_ZEXT) {
2587  B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2588  } else {
2589  B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2590  }
2591 
2592  extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2593  MRI.setRegBank(DstReg, *SrcBank);
2594  MI.eraseFromParent();
2595  return;
2596  }
2597 
2598  if (SrcTy != LLT::scalar(1))
2599  return;
2600 
2601  // It is not legal to have a legalization artifact with a VCC source. Rather
2602  // than introducing a copy, directly insert the select that such a copy would
2603  // have been selected to.
2604  if (SrcBank == &AMDGPU::VCCRegBank) {
2605  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2606 
2607  const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2608 
2609  unsigned DstSize = DstTy.getSizeInBits();
2610  // 64-bit select is SGPR only
2611  const bool UseSel64 = DstSize > 32 &&
2612  SrcBank->getID() == AMDGPU::SGPRRegBankID;
2613 
2614  // TODO: Should s16 select be legal?
2615  LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2616  auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2617  auto False = B.buildConstant(SelType, 0);
2618 
2619  MRI.setRegBank(True.getReg(0), *DstBank);
2620  MRI.setRegBank(False.getReg(0), *DstBank);
2621  MRI.setRegBank(DstReg, *DstBank);
2622 
2623  if (DstSize > 32) {
2624  B.buildSelect(DefRegs[0], SrcReg, True, False);
2625  extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2626  } else if (DstSize < 32) {
2627  auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2628  MRI.setRegBank(Sel.getReg(0), *DstBank);
2629  B.buildTrunc(DstReg, Sel);
2630  } else {
2631  B.buildSelect(DstReg, SrcReg, True, False);
2632  }
2633 
2634  MI.eraseFromParent();
2635  return;
2636  }
2637 
2638  break;
2639  }
2640  case AMDGPU::G_BUILD_VECTOR:
2641  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2642  Register DstReg = MI.getOperand(0).getReg();
2643  LLT DstTy = MRI.getType(DstReg);
2644  if (DstTy != LLT::fixed_vector(2, 16))
2645  break;
2646 
2647  assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2648  substituteSimpleCopyRegs(OpdMapper, 1);
2649  substituteSimpleCopyRegs(OpdMapper, 2);
2650 
2651  const RegisterBank *DstBank =
2652  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2653  if (DstBank == &AMDGPU::SGPRRegBank)
2654  break; // Can use S_PACK_* instructions.
2655 
2656  MachineIRBuilder B(MI);
2657 
2658  Register Lo = MI.getOperand(1).getReg();
2659  Register Hi = MI.getOperand(2).getReg();
2660  const LLT S32 = LLT::scalar(32);
2661 
2662  const RegisterBank *BankLo =
2663  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2664  const RegisterBank *BankHi =
2665  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2666 
2667  Register ZextLo;
2668  Register ShiftHi;
2669 
2670  if (Opc == AMDGPU::G_BUILD_VECTOR) {
2671  ZextLo = B.buildZExt(S32, Lo).getReg(0);
2672  MRI.setRegBank(ZextLo, *BankLo);
2673 
2674  Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2675  MRI.setRegBank(ZextHi, *BankHi);
2676 
2677  auto ShiftAmt = B.buildConstant(S32, 16);
2678  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2679 
2680  ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2681  MRI.setRegBank(ShiftHi, *BankHi);
2682  } else {
2683  Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2684  MRI.setRegBank(MaskLo, *BankLo);
2685 
2686  auto ShiftAmt = B.buildConstant(S32, 16);
2687  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2688 
2689  ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2690  MRI.setRegBank(ShiftHi, *BankHi);
2691 
2692  ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2693  MRI.setRegBank(ZextLo, *BankLo);
2694  }
2695 
2696  auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2697  MRI.setRegBank(Or.getReg(0), *DstBank);
2698 
2699  B.buildBitcast(DstReg, Or);
2700  MI.eraseFromParent();
2701  return;
2702  }
2703  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2704  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2705 
2706  assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2707 
2708  Register DstReg = MI.getOperand(0).getReg();
2709  Register SrcReg = MI.getOperand(1).getReg();
2710 
2711  const LLT S32 = LLT::scalar(32);
2712  LLT DstTy = MRI.getType(DstReg);
2713  LLT SrcTy = MRI.getType(SrcReg);
2714 
2715  if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2716  return;
2717 
2718  MachineIRBuilder B(MI);
2719 
2720  const ValueMapping &DstMapping
2721  = OpdMapper.getInstrMapping().getOperandMapping(0);
2722  const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2723  const RegisterBank *SrcBank =
2724  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2725  const RegisterBank *IdxBank =
2726  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2727 
2728  Register BaseIdxReg;
2729  unsigned ConstOffset;
2730  std::tie(BaseIdxReg, ConstOffset) =
2731  AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2732 
2733  // See if the index is an add of a constant which will be foldable by moving
2734  // the base register of the index later if this is going to be executed in a
2735  // waterfall loop. This is essentially to reassociate the add of a constant
2736  // with the readfirstlane.
2737  bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2738  ConstOffset > 0 &&
2739  ConstOffset < SrcTy.getNumElements();
2740 
2741  // Move the base register. We'll re-insert the add later.
2742  if (ShouldMoveIndexIntoLoop)
2743  MI.getOperand(2).setReg(BaseIdxReg);
2744 
2745  // If this is a VGPR result only because the index was a VGPR result, the
2746  // actual indexing will be done on the SGPR source vector, which will
2747  // produce a scalar result. We need to copy to the VGPR result inside the
2748  // waterfall loop.
2749  const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2750  SrcBank == &AMDGPU::SGPRRegBank;
2751  if (DstRegs.empty()) {
2752  applyDefaultMapping(OpdMapper);
2753 
2754  executeInWaterfallLoop(MI, MRI, { 2 });
2755 
2756  if (NeedCopyToVGPR) {
2757  // We don't want a phi for this temporary reg.
2758  Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2759  MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2760  MI.getOperand(0).setReg(TmpReg);
2761  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2762 
2763  // Use a v_mov_b32 here to make the exec dependency explicit.
2764  buildVCopy(B, DstReg, TmpReg);
2765  }
2766 
2767  // Re-insert the constant offset add inside the waterfall loop.
2768  if (ShouldMoveIndexIntoLoop)
2769  reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2770 
2771  return;
2772  }
2773 
2774  assert(DstTy.getSizeInBits() == 64);
2775 
2776  LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2777 
2778  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2779  auto One = B.buildConstant(S32, 1);
2780 
2781  MachineBasicBlock::iterator MII = MI.getIterator();
2782 
2783  // Split the vector index into 32-bit pieces. Prepare to move all of the
2784  // new instructions into a waterfall loop if necessary.
2785  //
2786  // Don't put the bitcast or constant in the loop.
2787  MachineInstrSpan Span(MII, &B.getMBB());
2788 
2789  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2790  auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2791  auto IdxHi = B.buildAdd(S32, IdxLo, One);
2792 
2793  auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2794  auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2795 
2796  MRI.setRegBank(DstReg, *DstBank);
2797  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2798  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2799  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2800  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2801 
2802  SmallSet<Register, 4> OpsToWaterfall;
2803  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2804  MI.eraseFromParent();
2805  return;
2806  }
2807 
2808  // Remove the original instruction to avoid potentially confusing the
2809  // waterfall loop logic.
2810  B.setInstr(*Span.begin());
2811  MI.eraseFromParent();
2812  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2813  OpsToWaterfall, MRI);
2814 
2815  if (NeedCopyToVGPR) {
2816  MachineBasicBlock *LoopBB = Extract1->getParent();
2817  Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2818  Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2819  MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2820  MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2821 
2822  Extract0->getOperand(0).setReg(TmpReg0);
2823  Extract1->getOperand(0).setReg(TmpReg1);
2824 
2825  B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2826 
2827  buildVCopy(B, DstRegs[0], TmpReg0);
2828  buildVCopy(B, DstRegs[1], TmpReg1);
2829  }
2830 
2831  if (ShouldMoveIndexIntoLoop)
2832  reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2833 
2834  return;
2835  }
2836  case AMDGPU::G_INSERT_VECTOR_ELT: {
2837  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2838 
2839  Register DstReg = MI.getOperand(0).getReg();
2840  LLT VecTy = MRI.getType(DstReg);
2841 
2842  assert(OpdMapper.getVRegs(0).empty());
2843  assert(OpdMapper.getVRegs(3).empty());
2844 
2845  if (substituteSimpleCopyRegs(OpdMapper, 1))
2846  MRI.setType(MI.getOperand(1).getReg(), VecTy);
2847 
2848  if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2849  return;
2850 
2851  const RegisterBank *IdxBank =
2852  OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2853 
2854  Register SrcReg = MI.getOperand(1).getReg();
2855  Register InsReg = MI.getOperand(2).getReg();
2856  LLT InsTy = MRI.getType(InsReg);
2857  (void)InsTy;
2858 
2859  Register BaseIdxReg;
2860  unsigned ConstOffset;
2861  std::tie(BaseIdxReg, ConstOffset) =
2862  AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2863 
2864  // See if the index is an add of a constant which will be foldable by moving
2865  // the base register of the index later if this is going to be executed in a
2866  // waterfall loop. This is essentially to reassociate the add of a constant
2867  // with the readfirstlane.
2868  bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2869  ConstOffset > 0 &&
2870  ConstOffset < VecTy.getNumElements();
2871 
2872  // Move the base register. We'll re-insert the add later.
2873  if (ShouldMoveIndexIntoLoop)
2874  MI.getOperand(3).setReg(BaseIdxReg);
2875 
2876 
2877  if (InsRegs.empty()) {
2878  executeInWaterfallLoop(MI, MRI, { 3 });
2879 
2880  // Re-insert the constant offset add inside the waterfall loop.
2881  if (ShouldMoveIndexIntoLoop) {
2882  MachineIRBuilder B(MI);
2883  reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2884  }
2885 
2886  return;
2887  }
2888 
2889 
2890  assert(InsTy.getSizeInBits() == 64);
2891 
2892  const LLT S32 = LLT::scalar(32);
2893  LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2894 
2895  MachineIRBuilder B(MI);
2896  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2897  auto One = B.buildConstant(S32, 1);
2898 
2899  // Split the vector index into 32-bit pieces. Prepare to move all of the
2900  // new instructions into a waterfall loop if necessary.
2901  //
2902  // Don't put the bitcast or constant in the loop.
2903  MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2904 
2905  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2906  auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2907  auto IdxHi = B.buildAdd(S32, IdxLo, One);
2908 
2909  auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2910  auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2911 
2912  const RegisterBank *DstBank =
2913  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2914  const RegisterBank *SrcBank =
2915  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2916  const RegisterBank *InsSrcBank =
2917  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2918 
2919  MRI.setRegBank(InsReg, *InsSrcBank);
2920  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2921  MRI.setRegBank(InsLo.getReg(0), *DstBank);
2922  MRI.setRegBank(InsHi.getReg(0), *DstBank);
2923  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2924  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2925  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2926 
2927 
2928  SmallSet<Register, 4> OpsToWaterfall;
2929  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2930  B.setInsertPt(B.getMBB(), MI);
2931  B.buildBitcast(DstReg, InsHi);
2932  MI.eraseFromParent();
2933  return;
2934  }
2935 
2936  B.setInstr(*Span.begin());
2937  MI.eraseFromParent();
2938 
2939  // Figure out the point after the waterfall loop before mangling the control
2940  // flow.
2941  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2942  OpsToWaterfall, MRI);
2943 
2944  // The insertion point is now right after the original instruction.
2945  //
2946  // Keep the bitcast to the original vector type out of the loop. Doing this
2947  // saves an extra phi we don't need inside the loop.
2948  B.buildBitcast(DstReg, InsHi);
2949 
2950  // Re-insert the constant offset add inside the waterfall loop.
2951  if (ShouldMoveIndexIntoLoop)
2952  reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2953 
2954  return;
2955  }
2956  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2957  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2958  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2959  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2960  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2961  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2962  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2963  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2964  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2965  case AMDGPU::G_AMDGPU_BUFFER_STORE:
2966  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2967  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2968  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2969  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2970  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2971  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2972  applyDefaultMapping(OpdMapper);
2973  executeInWaterfallLoop(MI, MRI, {1, 4});
2974  return;
2975  }
2976  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2977  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2978  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2979  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2980  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2981  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2982  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2983  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2984  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2985  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2986  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2987  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2988  applyDefaultMapping(OpdMapper);
2989  executeInWaterfallLoop(MI, MRI, {2, 5});
2990  return;
2991  }
2992  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2993  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2994  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2995  applyDefaultMapping(OpdMapper);
2996  executeInWaterfallLoop(MI, MRI, {2, 5});
2997  return;
2998  }
2999  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3000  applyDefaultMapping(OpdMapper);
3001  executeInWaterfallLoop(MI, MRI, {3, 6});
3002  return;
3003  }
3004  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3005  applyMappingSBufferLoad(OpdMapper);
3006  return;
3007  }
3008  case AMDGPU::G_INTRINSIC: {
3009  switch (MI.getIntrinsicID()) {
3010  case Intrinsic::amdgcn_readlane: {
3011  substituteSimpleCopyRegs(OpdMapper, 2);
3012 
3013  assert(OpdMapper.getVRegs(0).empty());
3014  assert(OpdMapper.getVRegs(3).empty());
3015 
3016  // Make sure the index is an SGPR. It doesn't make sense to run this in a
3017  // waterfall loop, so assume it's a uniform value.
3018  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3019  return;
3020  }
3021  case Intrinsic::amdgcn_writelane: {
3022  assert(OpdMapper.getVRegs(0).empty());
3023  assert(OpdMapper.getVRegs(2).empty());
3024  assert(OpdMapper.getVRegs(3).empty());
3025 
3026  substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3027  constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
3028  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3029  return;
3030  }
3031  case Intrinsic::amdgcn_interp_p1:
3032  case Intrinsic::amdgcn_interp_p2:
3033  case Intrinsic::amdgcn_interp_mov:
3034  case Intrinsic::amdgcn_interp_p1_f16:
3035  case Intrinsic::amdgcn_interp_p2_f16: {
3036  applyDefaultMapping(OpdMapper);
3037 
3038  // Readlane for m0 value, which is always the last operand.
3039  // FIXME: Should this be a waterfall loop instead?
3040  constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3041  return;
3042  }
3043  case Intrinsic::amdgcn_permlane16:
3044  case Intrinsic::amdgcn_permlanex16: {
3045  // Doing a waterfall loop over these wouldn't make any sense.
3046  substituteSimpleCopyRegs(OpdMapper, 2);
3047  substituteSimpleCopyRegs(OpdMapper, 3);
3048  constrainOpWithReadfirstlane(MI, MRI, 4);
3049  constrainOpWithReadfirstlane(MI, MRI, 5);
3050  return;
3051  }
3052  case Intrinsic::amdgcn_sbfe:
3053  applyMappingBFE(OpdMapper, true);
3054  return;
3055  case Intrinsic::amdgcn_ubfe:
3056  applyMappingBFE(OpdMapper, false);
3057  return;
3058  case Intrinsic::amdgcn_ballot:
3059  // Use default handling and insert copy to vcc source.
3060  break;
3061  }
3062  break;
3063  }
3064  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3065  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3066  const AMDGPU::RsrcIntrinsic *RSrcIntrin
3067  = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3068  assert(RSrcIntrin && RSrcIntrin->IsImage);
3069  // Non-images can have complications from operands that allow both SGPR
3070  // and VGPR. For now it's too complicated to figure out the final opcode
3071  // to derive the register bank from the MCInstrDesc.
3072  applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3073  return;
3074  }
3075  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3076  unsigned N = MI.getNumExplicitOperands() - 2;
3077  applyDefaultMapping(OpdMapper);
3078  executeInWaterfallLoop(MI, MRI, { N });
3079  return;
3080  }
3081  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3082  auto IntrID = MI.getIntrinsicID();
3083  switch (IntrID) {
3084  case Intrinsic::amdgcn_ds_ordered_add:
3085  case Intrinsic::amdgcn_ds_ordered_swap: {
3086  // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3087  assert(OpdMapper.getVRegs(0).empty());
3088  substituteSimpleCopyRegs(OpdMapper, 3);
3089  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3090  return;
3091  }
3092  case Intrinsic::amdgcn_ds_gws_init:
3093  case Intrinsic::amdgcn_ds_gws_barrier:
3094  case Intrinsic::amdgcn_ds_gws_sema_br: {
3095  // Only the first lane executes, so readfirstlane is safe.
3096  substituteSimpleCopyRegs(OpdMapper, 1);
3097  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3098  return;
3099  }
3100  case Intrinsic::amdgcn_ds_gws_sema_v:
3101  case Intrinsic::amdgcn_ds_gws_sema_p:
3102  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3103  // Only the first lane executes, so readfirstlane is safe.
3104  constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3105  return;
3106  }
3107  case Intrinsic::amdgcn_ds_append:
3108  case Intrinsic::amdgcn_ds_consume: {
3109  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3110  return;
3111  }
3112  case Intrinsic::amdgcn_s_sendmsg:
3113  case Intrinsic::amdgcn_s_sendmsghalt: {
3114  // FIXME: Should this use a waterfall loop?
3115  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3116  return;
3117  }
3118  case Intrinsic::amdgcn_s_setreg: {
3119  constrainOpWithReadfirstlane(MI, MRI, 2);
3120  return;
3121  }
3122  default: {
3123  if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3124  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3125  // Non-images can have complications from operands that allow both SGPR
3126  // and VGPR. For now it's too complicated to figure out the final opcode
3127  // to derive the register bank from the MCInstrDesc.
3128  if (RSrcIntrin->IsImage) {
3129  applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3130  return;
3131  }
3132  }
3133 
3134  break;
3135  }
3136  }
3137  break;
3138  }
3139  case AMDGPU::G_LOAD:
3140  case AMDGPU::G_ZEXTLOAD:
3141  case AMDGPU::G_SEXTLOAD: {
3142  if (applyMappingLoad(MI, OpdMapper, MRI))
3143  return;
3144  break;
3145  }
3146  case AMDGPU::G_DYN_STACKALLOC:
3147  applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3148  return;
3149  case AMDGPU::G_SBFX:
3150  applyMappingBFE(OpdMapper, /*Signed*/ true);
3151  return;
3152  case AMDGPU::G_UBFX:
3153  applyMappingBFE(OpdMapper, /*Signed*/ false);
3154  return;
3155  default:
3156  break;
3157  }
3158 
3159  return applyDefaultMapping(OpdMapper);
3160 }
3161 
3162 // vgpr, sgpr -> vgpr
3163 // vgpr, agpr -> vgpr
3164 // agpr, agpr -> agpr
3165 // agpr, sgpr -> vgpr
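// e.g. regBankUnion(SGPRRegBankID, SGPRRegBankID) == SGPRRegBankID, while any
// combination involving a VGPR (or an AGPR mixed with an SGPR) yields
// VGPRRegBankID.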
3166 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3167  if (RB0 == AMDGPU::InvalidRegBankID)
3168  return RB1;
3169  if (RB1 == AMDGPU::InvalidRegBankID)
3170  return RB0;
3171 
3172  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3173  return AMDGPU::SGPRRegBankID;
3174 
3175  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3176  return AMDGPU::AGPRRegBankID;
3177 
3178  return AMDGPU::VGPRRegBankID;
3179 }
3180 
3181 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3182  if (RB0 == AMDGPU::InvalidRegBankID)
3183  return RB1;
3184  if (RB1 == AMDGPU::InvalidRegBankID)
3185  return RB0;
3186 
3187  // vcc, vcc -> vcc
3188  // vcc, sgpr -> vcc
3189  // vcc, vgpr -> vcc
3190  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3191  return AMDGPU::VCCRegBankID;
3192 
3193  // vcc, vgpr -> vgpr
3194  return regBankUnion(RB0, RB1);
3195 }
3196 
3197 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3198  const MachineInstr &MI) const {
3199  unsigned RegBank = AMDGPU::InvalidRegBankID;
3200 
3201  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3202  if (!MI.getOperand(i).isReg())
3203  continue;
3204  Register Reg = MI.getOperand(i).getReg();
3205  if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3206  RegBank = regBankUnion(RegBank, Bank->getID());
3207  if (RegBank == AMDGPU::VGPRRegBankID)
3208  break;
3209  }
3210  }
3211 
3212  return RegBank;
3213 }
3214 
3215 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3216  const MachineFunction &MF = *MI.getParent()->getParent();
3217  const MachineRegisterInfo &MRI = MF.getRegInfo();
3218  for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3219  if (!MI.getOperand(i).isReg())
3220  continue;
3221  Register Reg = MI.getOperand(i).getReg();
3222  if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3223  if (Bank->getID() != AMDGPU::SGPRRegBankID)
3224  return false;
3225  }
3226  }
3227  return true;
3228 }
3229 
3230 const RegisterBankInfo::InstructionMapping &
3231 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3232  const MachineFunction &MF = *MI.getParent()->getParent();
3233  const MachineRegisterInfo &MRI = MF.getRegInfo();
3234  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3235 
3236  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3237  const MachineOperand &SrcOp = MI.getOperand(i);
3238  if (!SrcOp.isReg())
3239  continue;
3240 
3241  unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3242  OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3243  }
3244  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3245  MI.getNumOperands());
3246 }
3247 
3248 const RegisterBankInfo::InstructionMapping &
3249 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3250  const MachineFunction &MF = *MI.getParent()->getParent();
3251  const MachineRegisterInfo &MRI = MF.getRegInfo();
3252  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3253 
3254  // Even though we technically could use SGPRs, this would require knowledge of
3255  // the constant bus restriction. Force all sources to VGPR (except for VCC).
3256  //
3257  // TODO: Unary ops are trivially OK, so accept SGPRs?
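  // For example, a 32-bit G_FADD whose inputs currently live in SGPRs is still
  // reported with VGPR sources here; RegBankSelect then inserts the (legal)
  // SGPR->VGPR copies to satisfy the mapping.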
3258  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3259  const MachineOperand &Src = MI.getOperand(i);
3260  if (!Src.isReg())
3261  continue;
3262 
3263  unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3264  unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3265  OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3266  }
3267 
3268  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3269  MI.getNumOperands());
3270 }
3271 
3272 const RegisterBankInfo::InstructionMapping &
3273 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3274  const MachineFunction &MF = *MI.getParent()->getParent();
3275  const MachineRegisterInfo &MRI = MF.getRegInfo();
3276  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3277 
3278  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3279  const MachineOperand &Op = MI.getOperand(I);
3280  if (!Op.isReg())
3281  continue;
3282 
3283  unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3284  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3285  }
3286 
3287  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3288  MI.getNumOperands());
3289 }
3290 
3291 const RegisterBankInfo::InstructionMapping &
3292 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3293  const MachineInstr &MI,
3294  int RsrcIdx) const {
3295  // The reported argument index is relative to the IR intrinsic call arguments,
3296  // so we need to shift by the number of defs and the intrinsic ID.
3297  RsrcIdx += MI.getNumExplicitDefs() + 1;
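  // e.g. with a single def, IR argument index 1 maps to machine operand
  // index 3: one def, the intrinsic ID operand, then the arguments.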
3298 
3299  const int NumOps = MI.getNumOperands();
3300  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3301 
3302  // TODO: Should packed/unpacked D16 difference be reported here as part of
3303  // the value mapping?
3304  for (int I = 0; I != NumOps; ++I) {
3305  if (!MI.getOperand(I).isReg())
3306  continue;
3307 
3308  Register OpReg = MI.getOperand(I).getReg();
3309  // We replace some dead address operands with $noreg
3310  if (!OpReg)
3311  continue;
3312 
3313  unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3314 
3315  // FIXME: Probably need a new intrinsic register bank searchable table to
3316  // handle arbitrary intrinsics easily.
3317  //
3318  // If this has a sampler, it immediately follows rsrc.
3319  const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3320 
3321  if (MustBeSGPR) {
3322  // This must be an SGPR, so we must report whatever it is as legal.
3323  unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3324  OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3325  } else {
3326  // Some operands must be VGPR, and these are easy to copy to.
3327  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3328  }
3329  }
3330 
3331  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3332 }
3333 
3334 /// Return the mapping for a pointer argument.
3335 const RegisterBankInfo::ValueMapping *
3336 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3337  Register PtrReg) const {
3338  LLT PtrTy = MRI.getType(PtrReg);
3339  unsigned Size = PtrTy.getSizeInBits();
3340  if (Subtarget.useFlatForGlobal() ||
3341  !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3342  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3343 
3344  // If we're using MUBUF instructions for global memory, an SGPR base register
3345  // is possible. Otherwise this needs to be a VGPR.
3346  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3347  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3348 }
3349 
3350 const RegisterBankInfo::InstructionMapping &
3351 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3352
3353  const MachineFunction &MF = *MI.getParent()->getParent();
3354  const MachineRegisterInfo &MRI = MF.getRegInfo();
3355  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3356  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3357  Register PtrReg = MI.getOperand(1).getReg();
3358  LLT PtrTy = MRI.getType(PtrReg);
3359  unsigned AS = PtrTy.getAddressSpace();
3360  unsigned PtrSize = PtrTy.getSizeInBits();
3361 
3362  const ValueMapping *ValMapping;
3363  const ValueMapping *PtrMapping;
3364 
3365  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3366 
3367  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3368  if (isScalarLoadLegal(MI)) {
3369  // We have a uniform instruction so we want to use an SMRD load
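  // SMRD/S_LOAD instructions go through the scalar cache, so both the result
  // and the pointer can stay on the SGPR bank.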
3370  ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3371  PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3372  } else {
3373  ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3374 
3375  // If we're using MUBUF instructions for global memory, an SGPR base
3376  // register is possible. Otherwise this needs to be a VGPR.
3377  unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3378  AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3379 
3380  PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3381  }
3382  } else {
3383  ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3384  PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3385  }
3386 
3387  OpdsMapping[0] = ValMapping;
3388  OpdsMapping[1] = PtrMapping;
3389  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3390  1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3391  return Mapping;
3392 
3393  // FIXME: Do we want to add a mapping for FLAT load, or should we just
3394  // handle that during instruction selection?
3395 }
3396 
3397 unsigned
3398 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3399  const MachineRegisterInfo &MRI,
3400  unsigned Default) const {
3401  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3402  return Bank ? Bank->getID() : Default;
3403 }
3404 
3405 const RegisterBankInfo::ValueMapping *
3406 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3407  const MachineRegisterInfo &MRI,
3408  const TargetRegisterInfo &TRI) const {
3409  // Lie and claim anything is legal, even though this needs to be an SGPR;
3410  // applyMapping will have to deal with it as a waterfall loop.
3411  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3412  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3413  return AMDGPU::getValueMapping(Bank, Size);
3414 }
3415 
3416 const RegisterBankInfo::ValueMapping *
3417 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3418  const MachineRegisterInfo &MRI,
3419  const TargetRegisterInfo &TRI) const {
3420  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3421  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3422 }
3423 
3424 const RegisterBankInfo::ValueMapping *
3425 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3426  const MachineRegisterInfo &MRI,
3427  const TargetRegisterInfo &TRI) const {
3428  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3429  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3430 }
3431 
3432 ///
3433 /// This function must return a legal mapping, because
3434 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3435 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3436 /// VGPR to SGPR copy to be generated is illegal.
3437 ///
3438 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3439 // legal. These will be dealt with in applyMappingImpl.
3440 //
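// For example, the M0 operand of the ds_gws intrinsics below is reported with
// whatever bank it currently has; applyMappingImpl inserts a readfirstlane if
// it turns out to be a VGPR.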
3441 const RegisterBankInfo::InstructionMapping &
3442 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3443  const MachineFunction &MF = *MI.getParent()->getParent();
3444  const MachineRegisterInfo &MRI = MF.getRegInfo();
3445 
3446  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3447  // The default logic bothers to analyze impossible alternative mappings. We
3448  // want the most straightforward mapping, so just directly handle this.
3449  const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3450  *TRI);
3451  const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3452  *TRI);
3453  assert(SrcBank && "src bank should have been assigned already");
3454  if (!DstBank)
3455  DstBank = SrcBank;
3456 
3457  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3458  if (cannotCopy(*DstBank, *SrcBank, Size))
3459  return getInvalidInstructionMapping();
3460
3461  const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3462  unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3463  SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3464  OpdsMapping[0] = &ValMap;
3465  if (MI.getOpcode() == AMDGPU::G_FREEZE)
3466  OpdsMapping[1] = &ValMap;
3467 
3468  return getInstructionMapping(
3469  1, /*Cost*/ 1,
3470  /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3471  }
3472 
3473  if (MI.isRegSequence()) {
3474  // If any input is a VGPR, the result must be a VGPR. The default handling
3475  // assumes any copy between banks is legal.
3476  unsigned BankID = AMDGPU::SGPRRegBankID;
3477 
3478  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3479  auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3480  // It doesn't make sense to use vcc or scc banks here, so just ignore
3481  // them.
3482  if (OpBank != AMDGPU::SGPRRegBankID) {
3483  BankID = AMDGPU::VGPRRegBankID;
3484  break;
3485  }
3486  }
3487  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3488 
3489  const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3490  return getInstructionMapping(
3491  1, /*Cost*/ 1,
3492  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3493  }
3494 
3495  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3496  // properly.
3497  //
3498  // TODO: There are additional exec masking dependencies to analyze.
3499  if (MI.getOpcode() == TargetOpcode::G_PHI) {
3500  unsigned ResultBank = AMDGPU::InvalidRegBankID;
3501  Register DstReg = MI.getOperand(0).getReg();
3502 
3503  // Sometimes the result may have already been assigned a bank.
3504  if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3505  ResultBank = DstBank->getID();
3506 
3507  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3508  Register Reg = MI.getOperand(I).getReg();
3509  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3510 
3511  // FIXME: Assuming VGPR for any undetermined inputs.
3512  if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3513  ResultBank = AMDGPU::VGPRRegBankID;
3514  break;
3515  }
3516 
3517  // FIXME: Need to promote SGPR case to s32
3518  unsigned OpBank = Bank->getID();
3519  ResultBank = regBankBoolUnion(ResultBank, OpBank);
3520  }
3521 
3522  assert(ResultBank != AMDGPU::InvalidRegBankID);
3523 
3524  unsigned Size = MRI.getType(DstReg).getSizeInBits();
3525 
3526  const ValueMapping &ValMap =
3527  getValueMapping(0, Size, getRegBank(ResultBank));
3528  return getInstructionMapping(
3529  1, /*Cost*/ 1,
3530  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3531  }
3532 
3533  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3534  if (Mapping.isValid())
3535  return Mapping;
3536 
3537  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3538 
3539  switch (MI.getOpcode()) {
3540  default:
3541  return getInvalidInstructionMapping();
3542
3543  case AMDGPU::G_AND:
3544  case AMDGPU::G_OR:
3545  case AMDGPU::G_XOR: {
3546  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3547  if (Size == 1) {
3548  const RegisterBank *DstBank
3549  = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3550 
3551  unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3552  unsigned BankLHS = AMDGPU::InvalidRegBankID;
3553  unsigned BankRHS = AMDGPU::InvalidRegBankID;
3554  if (DstBank) {
3555  TargetBankID = DstBank->getID();
3556  if (DstBank == &AMDGPU::VCCRegBank) {
3557  TargetBankID = AMDGPU::VCCRegBankID;
3558  BankLHS = AMDGPU::VCCRegBankID;
3559  BankRHS = AMDGPU::VCCRegBankID;
3560  } else {
3561  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3562  AMDGPU::SGPRRegBankID);
3563  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3564  AMDGPU::SGPRRegBankID);
3565  }
3566  } else {
3567  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3568  AMDGPU::VCCRegBankID);
3569  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3570  AMDGPU::VCCRegBankID);
3571 
3572  // Both inputs should be true booleans to produce a boolean result.
3573  if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3574  TargetBankID = AMDGPU::VGPRRegBankID;
3575  } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3576  TargetBankID = AMDGPU::VCCRegBankID;
3577  BankLHS = AMDGPU::VCCRegBankID;
3578  BankRHS = AMDGPU::VCCRegBankID;
3579  } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3580  TargetBankID = AMDGPU::SGPRRegBankID;
3581  }
3582  }
3583 
3584  OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3585  OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3586  OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3587  break;
3588  }
3589 
3590  if (Size == 64) {
3591 
3592  if (isSALUMapping(MI)) {
3593  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3594  OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3595  } else {
3596  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3597  unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3598  OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3599 
3600  unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3601  OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3602  }
3603 
3604  break;
3605  }
3606 
3607  LLVM_FALLTHROUGH;
3608  }
3609  case AMDGPU::G_PTR_ADD:
3610  case AMDGPU::G_PTRMASK:
3611  case AMDGPU::G_ADD:
3612  case AMDGPU::G_SUB:
3613  case AMDGPU::G_MUL:
3614  case AMDGPU::G_SHL:
3615  case AMDGPU::G_LSHR:
3616  case AMDGPU::G_ASHR:
3617  case AMDGPU::G_UADDO:
3618  case AMDGPU::G_USUBO:
3619  case AMDGPU::G_UADDE:
3620  case AMDGPU::G_SADDE:
3621  case AMDGPU::G_USUBE:
3622  case AMDGPU::G_SSUBE:
3623  case AMDGPU::G_SMIN:
3624  case AMDGPU::G_SMAX:
3625  case AMDGPU::G_UMIN:
3626  case AMDGPU::G_UMAX:
3627  case AMDGPU::G_ABS:
3628  case AMDGPU::G_SHUFFLE_VECTOR:
3629  case AMDGPU::G_SBFX:
3630  case AMDGPU::G_UBFX:
3631  if (isSALUMapping(MI))
3632  return getDefaultMappingSOP(MI);
3633  LLVM_FALLTHROUGH;
3634
3635  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3636  case AMDGPU::G_SSUBSAT:
3637  case AMDGPU::G_UADDSAT:
3638  case AMDGPU::G_USUBSAT:
3639  case AMDGPU::G_FADD:
3640  case AMDGPU::G_FSUB:
3641  case AMDGPU::G_FPTOSI:
3642  case AMDGPU::G_FPTOUI:
3643  case AMDGPU::G_FMUL:
3644  case AMDGPU::G_FMA:
3645  case AMDGPU::G_FMAD:
3646  case AMDGPU::G_FSQRT:
3647  case AMDGPU::G_FFLOOR:
3648  case AMDGPU::G_FCEIL:
3649  case AMDGPU::G_FRINT:
3650  case AMDGPU::G_SITOFP:
3651  case AMDGPU::G_UITOFP:
3652  case AMDGPU::G_FPTRUNC:
3653  case AMDGPU::G_FPEXT:
3654  case AMDGPU::G_FEXP2:
3655  case AMDGPU::G_FLOG2:
3656  case AMDGPU::G_FMINNUM:
3657  case AMDGPU::G_FMAXNUM:
3658  case AMDGPU::G_FMINNUM_IEEE:
3659  case AMDGPU::G_FMAXNUM_IEEE:
3660  case AMDGPU::G_FCANONICALIZE:
3661  case AMDGPU::G_INTRINSIC_TRUNC:
3662  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3663  case AMDGPU::G_FSHR: // TODO: Expand for scalar
3664  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3665  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3666  case AMDGPU::G_AMDGPU_RCP_IFLAG:
3667  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3668  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3669  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3670  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3671  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3672  case AMDGPU::G_AMDGPU_SMED3:
3673  return getDefaultMappingVOP(MI);
3674  case AMDGPU::G_UMULH:
3675  case AMDGPU::G_SMULH: {
3676  if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3677  return getDefaultMappingSOP(MI);
3678  return getDefaultMappingVOP(MI);
3679  }
3680  case AMDGPU::G_IMPLICIT_DEF: {
3681  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3682  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3683  break;
3684  }
3685  case AMDGPU::G_FCONSTANT:
3686  case AMDGPU::G_CONSTANT:
3687  case AMDGPU::G_GLOBAL_VALUE:
3688  case AMDGPU::G_BLOCK_ADDR:
3689  case AMDGPU::G_READCYCLECOUNTER: {
3690  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3691  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3692  break;
3693  }
3694  case AMDGPU::G_FRAME_INDEX: {
3695  // TODO: This should be the same as other constants, but eliminateFrameIndex
3696  // currently assumes VALU uses.
3697  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3698  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3699  break;
3700  }
3701  case AMDGPU::G_DYN_STACKALLOC: {
3702  // Result is always uniform, and a wave reduction is needed for the source.
3703  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3704  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3705  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3706  break;
3707  }
3708  case AMDGPU::G_INSERT: {
3709  unsigned BankID = getMappingType(MRI, MI);
3710  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3711  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3712  unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3713  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3714  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3715  OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3716  OpdsMapping[3] = nullptr;
3717  break;
3718  }
3719  case AMDGPU::G_EXTRACT: {
3720  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3721  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3722  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3723  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3724  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3725  OpdsMapping[2] = nullptr;
3726  break;
3727  }
3728  case AMDGPU::G_BUILD_VECTOR:
3729  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3730  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3731  if (DstTy == LLT::fixed_vector(2, 16)) {
3732  unsigned DstSize = DstTy.getSizeInBits();
3733  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3734  unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3735  unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3736  unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3737 
3738  OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3739  OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3740  OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3741  break;
3742  }
3743 
3744  LLVM_FALLTHROUGH;
3745  }
3746  case AMDGPU::G_MERGE_VALUES:
3747  case AMDGPU::G_CONCAT_VECTORS: {
3748  unsigned Bank = getMappingType(MRI, MI);
3749  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3750  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3751 
3752  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3753  // Op1 and Dst should use the same register bank.
3754  for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3755  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3756  break;
3757  }
3758  case AMDGPU::G_BITREVERSE:
3759  case AMDGPU::G_BITCAST:
3760  case AMDGPU::G_INTTOPTR:
3761  case AMDGPU::G_PTRTOINT:
3762  case AMDGPU::G_FABS:
3763  case AMDGPU::G_FNEG: {
3764  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3765  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3766  OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3767  break;
3768  }
3769  case AMDGPU::G_AMDGPU_FFBH_U32:
3770  case AMDGPU::G_AMDGPU_FFBL_B32:
3771  case AMDGPU::G_CTLZ_ZERO_UNDEF:
3772  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3773  unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3774  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3775  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3776  OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3777  break;
3778  }
3779  case AMDGPU::G_CTPOP: {
3780  unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3781  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3782  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3783 
3784  // This should really be getValueMappingSGPR64Only, but allowing the generic
3785  // code to handle the register split just makes using LegalizerHelper more
3786  // difficult.
3787  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3788  break;
3789  }
3790  case AMDGPU::G_TRUNC: {
3791  Register Dst = MI.getOperand(0).getReg();
3792  Register Src = MI.getOperand(1).getReg();
3793  unsigned Bank = getRegBankID(Src, MRI);
3794  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3795  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3796  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3797  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3798  break;
3799  }
3800  case AMDGPU::G_ZEXT:
3801  case AMDGPU::G_SEXT:
3802  case AMDGPU::G_ANYEXT:
3803  case AMDGPU::G_SEXT_INREG: {
3804  Register Dst = MI.getOperand(0).getReg();
3805  Register Src = MI.getOperand(1).getReg();
3806  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3807  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3808 
3809  unsigned DstBank;
3810  const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3811  assert(SrcBank);
3812  switch (SrcBank->getID()) {
3813  case AMDGPU::SGPRRegBankID:
3814  DstBank = AMDGPU::SGPRRegBankID;
3815  break;
3816  default:
3817  DstBank = AMDGPU::VGPRRegBankID;
3818  break;
3819  }
3820 
3821  // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3822  // 32-bits, and then to 64.
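  // For a VGPR source the 64-bit result is therefore split into two 32-bit
  // halves, while an SGPR source can keep a single 64-bit value mapping.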
3823  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3824  OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3825  SrcSize);
3826  break;
3827  }
3828  case AMDGPU::G_FCMP: {
3829  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3830  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3831  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3832  OpdsMapping[1] = nullptr; // Predicate Operand.
3833  OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3834  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3835  break;
3836  }
3837  case AMDGPU::G_STORE: {
3838  assert(MI.getOperand(0).isReg());
3839  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3840 
3841  // FIXME: We need to specify a different reg bank once scalar stores are
3842  // supported.
3843  const ValueMapping *ValMapping =
3844  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3845  OpdsMapping[0] = ValMapping;
3846  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3847  break;
3848  }
3849  case AMDGPU::G_ICMP: {
3850  auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3851  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3852 
3853  // See if the result register has already been constrained to vcc, which may
3854  // happen due to control flow intrinsic lowering.
3855  unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3856  AMDGPU::SGPRRegBankID);
3857  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3858  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3859 
3860  bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3861  Op2Bank == AMDGPU::SGPRRegBankID &&
3862  Op3Bank == AMDGPU::SGPRRegBankID &&
3863  (Size == 32 || (Size == 64 &&
3864  (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3865  Subtarget.hasScalarCompareEq64()));
3866
3867  DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3868  unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3869 
3870  // TODO: Use 32-bit for scalar output size.
3871  // SCC results will need to be copied to a 32-bit SGPR virtual register.
3872  const unsigned ResultSize = 1;
3873 
3874  OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3875  OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3876  OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3877  break;
3878  }
3879  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3880  // VGPR index can be used for waterfall when indexing an SGPR vector.
3881  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3882  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3883  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3884  unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3885  unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3886  unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3887 
3888  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3889  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3890 
3891  // The index can be in either bank if the source vector is VGPR.
3892  OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3893  break;
3894  }
3895  case AMDGPU::G_INSERT_VECTOR_ELT: {
3896  unsigned OutputBankID = isSALUMapping(MI) ?
3897  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3898 
3899  unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3900  unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3901  unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3902  unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3903  unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3904 
3905  OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3906  OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3907 
3908  // This is a weird case, because we need to break down the mapping based on
3909  // the register bank of a different operand.
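  // i.e. whether the 64-bit insert source is split into two 32-bit pieces
  // depends on the vector's (output) bank, not on the source's own bank.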
3910  if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3911  OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3912  InsertSize);
3913  } else {
3914  assert(InsertSize == 32 || InsertSize == 64);
3915  OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3916  }
3917 
3918  // The index can be in either bank if the source vector is VGPR.
3919  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3920  break;
3921  }
3922  case AMDGPU::G_UNMERGE_VALUES: {
3923  unsigned Bank = getMappingType(MRI, MI);
3924 
3925  // Op1 and Dst should use the same register bank.
3926  // FIXME: Shouldn't this be the default? Why do we need to handle this?
3927  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3928  unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3929  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3930  }
3931  break;
3932  }
3933  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3934  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3935  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3936  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3937  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3938  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3939  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3940  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3941  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3942  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3943  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3944  case AMDGPU::G_AMDGPU_BUFFER_STORE:
3945  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3946  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3947  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3948  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3949  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3950 
3951  // rsrc
3952  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3953 
3954  // vindex
3955  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3956 
3957  // voffset
3958  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3959 
3960  // soffset
3961  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3962 
3963  // Any remaining operands are immediates and were correctly null
3964  // initialized.
3965  break;
3966  }
3967  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3968  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3969  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3970  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3971  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3972  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3973  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3974  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3975  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3976  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3977  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3978  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3979  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3980  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3981  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3982  // vdata_out
3983  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3984 
3985  // vdata_in
3986  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3987 
3988  // rsrc
3989  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3990 
3991  // vindex
3992  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3993 
3994  // voffset
3995  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3996 
3997  // soffset
3998  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3999 
4000  // Any remaining operands are immediates and were correctly null
4001  // initialized.
4002  break;
4003  }
4004  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4005  // vdata_out
4006  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4007 
4008  // vdata_in
4009  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4010 
4011  // cmp
4012  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4013 
4014  // rsrc
4015  OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4016 
4017  // vindex
4018  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4019 
4020  // voffset
4021  OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4022 
4023  // soffset
4024  OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4025 
4026  // Any remaining operands are immediates and were correctly null
4027  // initialized.
4028  break;
4029  }
4030  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4031  // Lie and claim everything is legal, even though some need to be
4032  // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4033  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4034  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4035 
4036  // We need to convert this to a MUBUF if either the resource or offset is
4037  // VGPR.
4038  unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4039  unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4040  unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4041 
4042  unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4043  OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4044  break;
4045  }
4046  case AMDGPU::G_INTRINSIC: {
4047  switch (MI.getIntrinsicID()) {
4048  default:
4049  return getInvalidInstructionMapping();
4050  case Intrinsic::amdgcn_div_fmas:
4051  case Intrinsic::amdgcn_div_fixup:
4052  case Intrinsic::amdgcn_trig_preop:
4053  case Intrinsic::amdgcn_sin:
4054  case Intrinsic::amdgcn_cos:
4055  case Intrinsic::amdgcn_log_clamp:
4056  case Intrinsic::amdgcn_rcp:
4057  case Intrinsic::amdgcn_rcp_legacy:
4058  case Intrinsic::amdgcn_sqrt:
4059  case Intrinsic::amdgcn_rsq:
4060  case Intrinsic::amdgcn_rsq_legacy:
4061  case Intrinsic::amdgcn_rsq_clamp:
4062  case Intrinsic::amdgcn_fmul_legacy:
4063  case Intrinsic::amdgcn_fma_legacy:
4064  case Intrinsic::amdgcn_ldexp:
4065  case Intrinsic::amdgcn_frexp_mant:
4066  case Intrinsic::amdgcn_frexp_exp:
4067  case Intrinsic::amdgcn_fract:
4068  case Intrinsic::amdgcn_cvt_pkrtz:
4069  case Intrinsic::amdgcn_cvt_pknorm_i16:
4070  case Intrinsic::amdgcn_cvt_pknorm_u16:
4071  case Intrinsic::amdgcn_cvt_pk_i16:
4072  case Intrinsic::amdgcn_cvt_pk_u16:
4073  case Intrinsic::amdgcn_fmed3:
4074  case Intrinsic::amdgcn_cubeid:
4075  case Intrinsic::amdgcn_cubema:
4076  case Intrinsic::amdgcn_cubesc:
4077  case Intrinsic::amdgcn_cubetc:
4078  case Intrinsic::amdgcn_sffbh:
4079  case Intrinsic::amdgcn_fmad_ftz:
4080  case Intrinsic::amdgcn_mbcnt_lo:
4081  case Intrinsic::amdgcn_mbcnt_hi:
4082  case Intrinsic::amdgcn_mul_u24:
4083  case Intrinsic::amdgcn_mul_i24:
4084  case Intrinsic::amdgcn_lerp:
4085  case Intrinsic::amdgcn_sad_u8:
4086  case Intrinsic::amdgcn_msad_u8:
4087  case Intrinsic::amdgcn_sad_hi_u8:
4088  case Intrinsic::amdgcn_sad_u16:
4089  case Intrinsic::amdgcn_qsad_pk_u16_u8:
4090  case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4091  case Intrinsic::amdgcn_mqsad_u32_u8:
4092  case Intrinsic::amdgcn_cvt_pk_u8_f32:
4093  case Intrinsic::amdgcn_alignbit:
4094  case Intrinsic::amdgcn_alignbyte:
4095  case Intrinsic::amdgcn_perm:
4096  case Intrinsic::amdgcn_fdot2:
4097  case Intrinsic::amdgcn_sdot2:
4098  case Intrinsic::amdgcn_udot2:
4099  case Intrinsic::amdgcn_sdot4:
4100  case Intrinsic::amdgcn_udot4:
4101  case Intrinsic::amdgcn_sdot8:
4102  case Intrinsic::amdgcn_udot8:
4103  return getDefaultMappingVOP(MI);
4104  case Intrinsic::amdgcn_sbfe:
4105  case Intrinsic::amdgcn_ubfe:
4106  if (isSALUMapping(MI))
4107  return getDefaultMappingSOP(MI);
4108  return getDefaultMappingVOP(MI);
4109  case Intrinsic::amdgcn_ds_swizzle:
4110  case Intrinsic::amdgcn_ds_permute:
4111  case Intrinsic::amdgcn_ds_bpermute:
4112  case Intrinsic::amdgcn_update_dpp:
4113  case Intrinsic::amdgcn_mov_dpp8:
4114  case Intrinsic::amdgcn_mov_dpp:
4115  case Intrinsic::amdgcn_strict_wwm:
4116  case Intrinsic::amdgcn_wwm:
4117  case Intrinsic::amdgcn_strict_wqm:
4118  case Intrinsic::amdgcn_wqm:
4119  case Intrinsic::amdgcn_softwqm:
4120  case Intrinsic::amdgcn_set_inactive:
4121  return getDefaultMappingAllVGPR(MI);
4122  case Intrinsic::amdgcn_kernarg_segment_ptr:
4123  case Intrinsic::amdgcn_s_getpc:
4124  case Intrinsic::amdgcn_groupstaticsize:
4125  case Intrinsic::amdgcn_reloc_constant:
4126  case Intrinsic::returnaddress: {
4127  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4128  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4129  break;
4130  }
4131  case Intrinsic::amdgcn_wqm_vote: {
4132  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4133  OpdsMapping[0] = OpdsMapping[2]
4134  = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4135  break;
4136  }
4137  case Intrinsic::amdgcn_ps_live: {
4138  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4139  break;
4140  }
4141  case Intrinsic::amdgcn_div_scale: {
4142  unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4143  unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4144  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4145  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4146 
4147  unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4148  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4149  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4150  break;
4151  }
4152  case Intrinsic::amdgcn_class: {
4153  Register Src0Reg = MI.getOperand(2).getReg();
4154  Register Src1Reg = MI.getOperand(3).getReg();
4155  unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4156  unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4157  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4158  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4159  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4160  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4161  break;
4162  }
4163  case Intrinsic::amdgcn_icmp:
4164  case Intrinsic::amdgcn_fcmp: {
4165  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4166  // This is not VCCRegBank because this is not used in boolean contexts.
4167  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4168  unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4169  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4170  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4171  break;
4172  }
4173  case Intrinsic::amdgcn_readlane: {
4174  // This must be an SGPR, but accept a VGPR.
4175  Register IdxReg = MI.getOperand(3).getReg();
4176  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4177  unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4178  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4179  LLVM_FALLTHROUGH;
4180  }
4181  case Intrinsic::amdgcn_readfirstlane: {
4182  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4183  unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4184  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4185  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4186  break;
4187  }
4188  case Intrinsic::amdgcn_writelane: {
4189  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4190  Register SrcReg = MI.getOperand(2).getReg();
4191  unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4192  unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4193  Register IdxReg = MI.getOperand(3).getReg();
4194  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4195  unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4196  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4197 
4198  // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4199  // to legalize.
4200  OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4201  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4202  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4203  break;
4204  }
4205  case Intrinsic::amdgcn_if_break: {
4206  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4207  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4208  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4209  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4210  break;
4211  }
4212  case Intrinsic::amdgcn_permlane16:
4213  case Intrinsic::amdgcn_permlanex16: {
4214  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4215  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4216  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4217  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4218  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4219  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4220  break;
4221  }
4222  case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4223  case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4224  case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4225  case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4226  case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4227  case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4228  case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4229  case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4230  case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4231  case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4232  case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4233  case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4234  case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4235  case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4236  case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4237  case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4238  case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4239  case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4240  case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4241  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4242  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4243  case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4244  case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4245  case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4246  case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4247  case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4248  case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4249  // Default for MAI intrinsics.
4250  // srcC can also be an immediate which can be folded later.
4251  // FIXME: Should we eventually add an alternative mapping with AGPR src
4252  // for srcA/srcB?
4253  //
4254  // vdst, srcA, srcB, srcC
4255  OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4256  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4257  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4258  OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4259  break;
4260  }
4261  case Intrinsic::amdgcn_interp_p1:
4262  case Intrinsic::amdgcn_interp_p2:
4263  case Intrinsic::amdgcn_interp_mov:
4264  case Intrinsic::amdgcn_interp_p1_f16:
4265  case Intrinsic::amdgcn_interp_p2_f16: {
4266  const int M0Idx = MI.getNumOperands() - 1;
4267  Register M0Reg = MI.getOperand(M0Idx).getReg();
4268  unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4269  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4270 
4271  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4272  for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4273  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4274 
4275  // Must be SGPR, but we must take whatever the original bank is and fix it
4276  // later.
4277  OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4278  break;
4279  }
4280  case Intrinsic::amdgcn_ballot: {
4281  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4282  unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4283  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4284  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4285  break;
4286  }
4287  }
4288  break;
4289  }
4290  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4291  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
4292  auto IntrID = MI.getIntrinsicID();
4293  const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4294  assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4295  // Non-images can have complications from operands that allow both SGPR
4296  // and VGPR. For now it's too complicated to figure out the final opcode
4297  // to derive the register bank from the MCInstrDesc.
4298  assert(RSrcIntrin->IsImage);
4299  return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4300  }
4301  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4302  unsigned N = MI.getNumExplicitOperands() - 2;
4303  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4304  OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4305  if (N == 3) {
4306  // Sequential form: all operands combined into VGPR256/VGPR512
4307  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4308  if (Size > 256)
4309  Size = 512;
4310  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4311  } else {
4312  // NSA form
4313  for (unsigned I = 2; I < N; ++I)
4314  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4315  }
4316  break;
4317  }
4318  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4319  auto IntrID = MI.getIntrinsicID();
4320  switch (IntrID) {
4321  case Intrinsic::amdgcn_s_getreg:
4322  case Intrinsic::amdgcn_s_memtime:
4323  case Intrinsic::amdgcn_s_memrealtime:
4324  case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4325  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4326  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4327  break;
4328  }
4329  case Intrinsic::amdgcn_global_atomic_fadd:
4330  case Intrinsic::amdgcn_global_atomic_csub:
4331  case Intrinsic::amdgcn_global_atomic_fmin:
4332  case Intrinsic::amdgcn_global_atomic_fmax:
4333  case Intrinsic::amdgcn_flat_atomic_fadd:
4334  case Intrinsic::amdgcn_flat_atomic_fmin:
4335  case Intrinsic::amdgcn_flat_atomic_fmax:
4336  return getDefaultMappingAllVGPR(MI);
4337  case Intrinsic::amdgcn_ds_ordered_add:
4338  case Intrinsic::amdgcn_ds_ordered_swap: {
4339  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4340  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4341  unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4342  AMDGPU::SGPRRegBankID);
4343  OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4344  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4345  break;
4346  }
4347  case Intrinsic::amdgcn_ds_append:
4348  case Intrinsic::amdgcn_ds_consume: {
4349  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4350  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4351  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4352  break;
4353  }
4354  case Intrinsic::amdgcn_exp_compr:
4355  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4356  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4357  break;
4358  case Intrinsic::amdgcn_exp:
4359  // FIXME: Could we support packed types here?
4360  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4361  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4362  OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4363  OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4364  break;
4365  case Intrinsic::amdgcn_s_sendmsg:
4366  case Intrinsic::amdgcn_s_sendmsghalt: {
4367  // This must be an SGPR, but accept a VGPR.
4368  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4369  AMDGPU::SGPRRegBankID);
4370  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4371  break;
4372  }
4373  case Intrinsic::amdgcn_s_setreg: {
4374  // This must be an SGPR, but accept a VGPR.
4375  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4376  AMDGPU::SGPRRegBankID);
4377  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4378  break;
4379  }
4380  case Intrinsic::amdgcn_end_cf: {
4381  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4382  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4383  break;
4384  }
4385  case Intrinsic::amdgcn_else: {
4386  unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4387  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4388  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4389  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4390  break;
4391  }
4392  case Intrinsic::amdgcn_live_mask: {
4393  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4394  break;
4395  }
4396  case Intrinsic::amdgcn_wqm_demote:
4397  case Intrinsic::amdgcn_kill: {
4398  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4399  break;
4400  }
4401  case Intrinsic::amdgcn_raw_buffer_load:
4402  case Intrinsic::amdgcn_raw_tbuffer_load: {
4403  // FIXME: Should make intrinsic ID the last operand of the instruction,
4404  // then this would be the same as store
4405  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4406  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4407  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4408  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4409  break;
4410  }
4411  case Intrinsic::amdgcn_raw_buffer_store:
4412  case Intrinsic::amdgcn_raw_buffer_store_format:
4413  case Intrinsic::amdgcn_raw_tbuffer_store: {
4414  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4415  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4416  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4417  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4418  break;
4419  }
4420  case Intrinsic::amdgcn_struct_buffer_load:
4421  case Intrinsic::amdgcn_struct_tbuffer_load: {
4422  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4423  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4424  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4425  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4426  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4427  break;
4428  }
4429  case Intrinsic::amdgcn_struct_buffer_store:
4430  case Intrinsic::amdgcn_struct_tbuffer_store: {
4431  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4432  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4433  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4434  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4435  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4436  break;
4437  }
4438  case Intrinsic::amdgcn_init_exec_from_input: {
4439  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4440  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4441  break;
4442  }
4443  case Intrinsic::amdgcn_ds_gws_init:
4444  case Intrinsic::amdgcn_ds_gws_barrier:
4445  case Intrinsic::amdgcn_ds_gws_sema_br: {
4446  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4447 
4448  // This must be an SGPR, but accept a VGPR.
4449  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4450  AMDGPU::SGPRRegBankID);
4451  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4452  break;
4453  }
4454  case Intrinsic::amdgcn_ds_gws_sema_v:
4455  case Intrinsic::amdgcn_ds_gws_sema_p:
4456  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4457  // This must be an SGPR, but accept a VGPR.
4458  unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4459  AMDGPU::SGPRRegBankID);
4460  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4461  break;
4462  }
4463  default:
4464  return getInvalidInstructionMapping();
4465  }
4466  break;
4467  }
4468  case AMDGPU::G_SELECT: {
4469  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4470  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4471  AMDGPU::SGPRRegBankID);
4472  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4473  AMDGPU::SGPRRegBankID);
4474  bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4475  Op3Bank == AMDGPU::SGPRRegBankID;
4476 
4477  unsigned CondBankDefault = SGPRSrcs ?
4478  AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4479  unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4480  CondBankDefault);
4481  if (CondBank == AMDGPU::SGPRRegBankID)
4482  CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4483  else if (CondBank == AMDGPU::VGPRRegBankID)
4484  CondBank = AMDGPU::VCCRegBankID;
4485 
4486  unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4487  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4488 
4489  assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4490 
4491  // TODO: Should report 32-bit for scalar condition type.
4492  if (Size == 64) {
4493  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4494  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4495  OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4496  OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4497  } else {
4498  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4499  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4500  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4501  OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4502  }
4503 
4504  break;
4505  }
4506 
4507  case AMDGPU::G_LOAD:
4508  case AMDGPU::G_ZEXTLOAD:
4509  case AMDGPU::G_SEXTLOAD:
4510  return getInstrMappingForLoad(MI);
4511 
4512  case AMDGPU::G_ATOMICRMW_XCHG:
4513  case AMDGPU::G_ATOMICRMW_ADD:
4514  case AMDGPU::G_ATOMICRMW_SUB:
4515  case AMDGPU::G_ATOMICRMW_AND:
4516  case AMDGPU::G_ATOMICRMW_OR:
4517  case AMDGPU::G_ATOMICRMW_XOR:
4518  case AMDGPU::G_ATOMICRMW_MAX:
4519  case AMDGPU::G_ATOMICRMW_MIN:
4520  case AMDGPU::G_ATOMICRMW_UMAX:
4521  case AMDGPU::G_ATOMICRMW_UMIN:
4522  case AMDGPU::G_ATOMICRMW_FADD:
4523  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4524  case AMDGPU::G_AMDGPU_ATOMIC_INC:
4525  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4526  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4527  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4528  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4529  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4530  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4531  break;
4532  }
4533  case AMDGPU::G_ATOMIC_CMPXCHG: {
4534  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4535  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4536  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4537  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4538  break;
4539  }
4540  case AMDGPU::G_BRCOND: {
4541  unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4542  AMDGPU::SGPRRegBankID);
4543  assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4544  if (Bank != AMDGPU::SGPRRegBankID)
4545  Bank = AMDGPU::VCCRegBankID;
4546 
4547  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4548  break;
4549  }
4550  }
4551 
4552  return getInstructionMapping(/*ID*/1, /*Cost*/1,
4553  getOperandsMapping(OpdsMapping),
4554  MI.getNumOperands());
4555 }