//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : MRI(MRI_), NewBank(RB) {}

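  // Once lowering finishes, assign banks to everything the legalizer created
  // while this observer was registered.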
  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Assign the new bank to any registers that don't yet have a register
  /// class or bank set.
  void applyBank(MachineInstr &MI) {
    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      // FIXME: This might not be enough to detect when SCC should be used.
      if (MRI.getType(Reg) == LLT::scalar(1))
        RB = (NewBank == &AMDGPU::SGPRRegBank ?
              &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
  (void)RBSGPR;
  assert(&RBSGPR == &AMDGPU::SGPRRegBank);

  const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
  (void)RBVGPR;
  assert(&RBVGPR == &AMDGPU::VGPRRegBank);
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
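  // A VGPR value may be divergent, so it can't be moved into an SGPR with an
  // ordinary copy; report an unbounded cost so this mapping is rejected.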
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      Src.getID() == AMDGPU::VGPRRegBankID) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SCCRegBankID ||
       Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  if (Dst.getID() == AMDGPU::SCCRegBankID &&
      Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
  const TargetRegisterClass &RC) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane select, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[4] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 },
        { { AMDGPU::SCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &SGPRMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SGPRMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 10, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make VS more expensive than SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isInstrUniformNonExtLoadAlign4(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1, getOperandsMapping(
        {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_ICMP: {
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit?
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in comparing the operand values across
/// lanes, so that each unique value is only processed once.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

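  // For each result, create an undef incoming value and a phi in the loop
  // that merges it with the value defined by the loop body, so every def has
  // a definition on both paths into the loop.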
  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  // Figure out the iterator range after splicing the instructions.
  auto NewBegin = std::prev(LoopBB->end());

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      if (SGPROperandRegs.count(Op.getReg())) {
        LLT OpTy = MRI.getType(Op.getReg());
        unsigned OpSize = OpTy.getSizeInBits();

        // Can only do a readlane of 32-bit pieces.
        if (OpSize == 32) {
          // Avoid extra copies in the simple case of one 32-bit register.
          Register CurrentLaneOpReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
          MRI.setType(CurrentLaneOpReg, OpTy);

          constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
          // Read the next variant <- also loop target.
          BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                  CurrentLaneOpReg)
            .addReg(Op.getReg());

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          // Compare the just-read value against all possible Idx values.
          B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(Op.getReg());
          Op.setReg(CurrentLaneOpReg);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        } else {
          LLT S32 = LLT::scalar(32);
          SmallVector<Register, 8> ReadlanePieces;

          // The compares can be done as 64-bit, but the extract needs to be
          // done in 32-bit pieces.

          bool Is64 = OpSize % 64 == 0;

          LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
          unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                            : AMDGPU::V_CMP_EQ_U32_e64;

          // Insert the unmerge before the loop.
          B.setMBB(MBB);
          auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
          B.setInstr(*I);

          unsigned NumPieces = Unmerge->getNumOperands() - 1;
          for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
            Register UnmergePiece = Unmerge.getReg(PieceIdx);

            Register CurrentLaneOpReg;
            if (Is64) {
              Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
              Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

              MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
              MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
              MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpRegLo)
                .addReg(UnmergePiece, 0, AMDGPU::sub0);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpRegHi)
                .addReg(UnmergePiece, 0, AMDGPU::sub1);

              CurrentLaneOpReg =
                B.buildMerge(LLT::scalar(64),
                             {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
                 .getReg(0);

              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

              if (OpTy.getScalarSizeInBits() == 64) {
                // We need to produce a 64-bit element vector, so use the
                // merged pieces.
                ReadlanePieces.push_back(CurrentLaneOpReg);
              } else {
                // 32-bit element type.
                ReadlanePieces.push_back(CurrentLaneOpRegLo);
                ReadlanePieces.push_back(CurrentLaneOpRegHi);
              }
            } else {
              CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
              MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpReg)
                .addReg(UnmergePiece);
              ReadlanePieces.push_back(CurrentLaneOpReg);
            }

            Register NewCondReg = MRI.createVirtualRegister(WaveRC);
            bool First = CondReg == AMDGPU::NoRegister;
            if (First)
              CondReg = NewCondReg;

            B.buildInstr(CmpOp)
              .addDef(NewCondReg)
              .addReg(CurrentLaneOpReg)
              .addReg(UnmergePiece);

            if (!First) {
              Register AndReg = MRI.createVirtualRegister(WaveRC);

              // If there are multiple operands to consider, AND the conditions
              // together.
              B.buildInstr(WaveAndOpc)
                .addDef(AndReg)
                .addReg(NewCondReg)
                .addReg(CondReg);
              CondReg = AndReg;
            }
          }

          // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
          // BUILD_VECTOR
          if (OpTy.isVector()) {
            auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
            Op.setReg(Merge.getReg(0));
          } else {
            auto Merge = B.buildMerge(OpTy, ReadlanePieces);
            Op.setReg(Merge.getReg(0));
          }

          MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
        }
      }
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, saving the original EXEC value.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Restore the insert point before the original instruction.
  B.setInsertPt(MBB, MBB.end());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}
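// Waterfall a single instruction by collecting the SGPR operands that need
// handling and wrapping the instruction in a one-element range.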
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I(MI);
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the instructions that
// originally wrote to the repaired registers. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
    if (&Other != &MI)
      return &Other;
  }

  return nullptr;
}

bool AMDGPURegisterBankInfo::applyMappingWideLoad(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty()) {
    Register PtrReg = MI.getOperand(1).getReg();
    const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
    if (PtrBank == &AMDGPU::SGPRRegBank)
      return false;
    SrcRegs.push_back(PtrReg);
  }

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

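  // Split the load so that no piece is wider than 128 bits, by reducing the
  // element count of the result vector type.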
  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.

  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand.
static void substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
  }
}

/// Handle register layout difference for f16 images for some subtargets.
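/// On subtargets with unpacked D16 VMEM instructions, each 16-bit element
/// occupies the low half of a separate 32-bit register, so <N x s16> store
/// data must be widened to N 32-bit registers first.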
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

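/// Decompose \p Reg into a base register and constant byte offset, looking
/// through a G_ADD of a register and a constant. A plain constant yields a
/// null base register.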
static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::make_pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::make_pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::make_pair(Reg, 0);
}

std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}

static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
  int64_t C;
  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
}

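// The buffer intrinsic cache policy immediate packs glc in bit 0, slc in
// bit 1 and dlc in bit 2.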
static unsigned extractGLC(unsigned CachePolicy) {
  return CachePolicy & 1;
}

static unsigned extractSLC(unsigned CachePolicy) {
  return (CachePolicy >> 1) & 1;
}

static unsigned extractDLC(unsigned CachePolicy) {
  return (CachePolicy >> 2) & 1;
}

MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  int EltSize = Ty.getScalarSizeInBits();
  int Size = Ty.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltSize != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();

  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  const bool Offen = !isZero(VOffset, MRI);

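  // Pick the MUBUF store opcode from the memory access size; stores wider
  // than one dword are mapped to the multi-dword variants below.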
  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
  switch (8 * MemSize) {
  case 8:
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
    break;
  case 16:
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
    break;
  default:
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    if (Size > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
    break;
  }

  // Set the insertion point back to the instruction in case it was moved into a
  // loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractGLC(CachePolicy))
     .addImm(extractSLC(CachePolicy))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractDLC(CachePolicy))
     .cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}

void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    MachineIRBuilder B(MI);
    if (Src0Regs.empty())
      Src0Regs.push_back(MI.getOperand(1).getReg());
    else {
      assert(Src0Regs.size() == 1);
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc)
      .addDef(DefRegs[0])
      .addUse(Src0Regs[0])
      .addUse(Src1Regs[0]);

    B.buildInstr(Opc)
      .addDef(DefRegs[1])
      .addUse(Src0Regs[1])
      .addUse(Src1Regs[1]);

    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(16))
      break;

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
        LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");
    return;
  }
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);
    LegalizerHelper Helper(*MF, Observer, B);

    // Turn scalar min/max into a compare and select.
    LLT Ty = MRI.getType(DstReg);
    LLT S32 = LLT::scalar(32);
    LLT S16 = LLT::scalar(16);

    if (Ty == S16) {
      // Need to widen to s32, and expand as cmp + select.
      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widenScalar should have succeeded");

      // FIXME: This is relying on widenScalar leaving MI in place.
      if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    } else {
      if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
        llvm_unreachable("lower should have succeeded");
    }

    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    bool Signed = Opc == AMDGPU::G_SEXT;

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::SCCRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      const LLT S32 = LLT::scalar(32);
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);

        // Replicate sign bit from 32-bit extended part.
        auto ShiftAmt = B.buildConstant(S32, 31);
        MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
        B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
      } else {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
        B.buildConstant(DefRegs[1], 0);
      }

      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
        &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SCCRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        B.buildCopy(DefRegs[1], DefRegs[0]);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    // Fixup the case with an s1 src that isn't a condition register. Use shifts
    // instead of introducing a compare to avoid an unnecessary condition
    // register (and since there are no scalar 16-bit compares).
    auto Ext = B.buildAnyExt(DstTy, SrcReg);
    auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
    auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);

    if (MI.getOpcode() == AMDGPU::G_SEXT)
      B.buildAShr(DstReg, Shl, ShiftAmt);
    else
      B.buildLShr(DstReg, Shl, ShiftAmt);

    MRI.setRegBank(DstReg, *SrcBank);
    MRI.setRegBank(Ext.getReg(0), *SrcBank);
    MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
    MRI.setRegBank(Shl.getReg(0), *SrcBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.

1606  MachineIRBuilder B(MI);
1607 
1608  Register Lo = MI.getOperand(1).getReg();
1609  Register Hi = MI.getOperand(2).getReg();
1610  const LLT S32 = LLT::scalar(32);
1611 
1612  const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
1613  const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);
1614 
1615  Register ZextLo;
1616  Register ShiftHi;
1617 
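 // For example (illustrative), a VGPR-mapped
 //   %v:vgpr(<2 x s16>) = G_BUILD_VECTOR %a:vgpr(s16), %b:vgpr(s16)
 // is packed manually, roughly as:
 //   %zlo:vgpr(s32) = G_ZEXT %a
 //   %zhi:vgpr(s32) = G_ZEXT %b
 //   %sh:vgpr(s32) = G_SHL %zhi, 16
 //   %or:vgpr(s32) = G_OR %zlo, %sh
 //   %v:vgpr(<2 x s16>) = G_BITCAST %or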
1618  if (Opc == AMDGPU::G_BUILD_VECTOR) {
1619  ZextLo = B.buildZExt(S32, Lo).getReg(0);
1620  MRI.setRegBank(ZextLo, *BankLo);
1621 
1622  Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
1623  MRI.setRegBank(ZextHi, *BankHi);
1624 
1625  auto ShiftAmt = B.buildConstant(S32, 16);
1626  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1627 
1628  ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
1629  MRI.setRegBank(ShiftHi, *BankHi);
1630  } else {
1631  Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
1632  MRI.setRegBank(MaskLo, *BankLo);
1633 
1634  auto ShiftAmt = B.buildConstant(S32, 16);
1635  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1636 
1637  ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
1638  MRI.setRegBank(ShiftHi, *BankHi);
1639 
1640  ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
1641  MRI.setRegBank(ZextLo, *BankLo);
1642  }
1643 
1644  auto Or = B.buildOr(S32, ZextLo, ShiftHi);
1645  MRI.setRegBank(Or.getReg(0), *DstBank);
1646 
1647  B.buildBitcast(DstReg, Or);
1648  MI.eraseFromParent();
1649  return;
1650  }
1651  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
1652  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1653 
1654  assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
1655 
1656  if (DstRegs.empty()) {
1657  applyDefaultMapping(OpdMapper);
1658  executeInWaterfallLoop(MI, MRI, { 2 });
1659  return;
1660  }
1661 
1662  Register DstReg = MI.getOperand(0).getReg();
1663  Register SrcReg = MI.getOperand(1).getReg();
1664  Register IdxReg = MI.getOperand(2).getReg();
1665  LLT DstTy = MRI.getType(DstReg);
1666  (void)DstTy;
1667 
1668  assert(DstTy.getSizeInBits() == 64);
1669 
1670  LLT SrcTy = MRI.getType(SrcReg);
1671  const LLT S32 = LLT::scalar(32);
1672  LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
1673 
1674  MachineIRBuilder B(MI);
1675  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
1676  auto One = B.buildConstant(S32, 1);
1677 
1678  // Turn the 64-bit element access into two 32-bit accesses. Prepare to
1679  // move all of the new instructions into a waterfall loop if necessary.
1680  //
1681  // Don't put the bitcast or constant in the loop.
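 // If the index turns out to be divergent, the instructions built below are
 // wrapped in a waterfall loop, which roughly (illustrative) does:
 //   loop:
 //     read the index of the first active lane into an SGPR,
 //     compare it against each lane's index and mask exec to the matches,
 //     run the body with the now-uniform SGPR index,
 //     restore exec for the remaining lanes and repeat until none are left.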
1682  MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
1683 
1684  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
1685  auto IdxLo = B.buildShl(S32, IdxReg, One);
1686  auto IdxHi = B.buildAdd(S32, IdxLo, One);
1687  B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
1688  B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
1689 
1690  const ValueMapping &DstMapping
1691  = OpdMapper.getInstrMapping().getOperandMapping(0);
1692 
1693  // FIXME: Should be getting from mapping or not?
1694  const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1695  MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank);
1696  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1697  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
1698  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
1699  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
1700 
1701  SmallSet<Register, 4> OpsToWaterfall;
1702  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
1703  MI.eraseFromParent();
1704  return;
1705  }
1706 
1707  // Remove the original instruction to avoid potentially confusing the
1708  // waterfall loop logic.
1709  B.setInstr(*Span.begin());
1710  MI.eraseFromParent();
1711  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1712  OpsToWaterfall, MRI);
1713  return;
1714  }
1715  case AMDGPU::G_INSERT_VECTOR_ELT: {
1716  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
1717 
1718  assert(OpdMapper.getVRegs(0).empty());
1719  assert(OpdMapper.getVRegs(1).empty());
1720  assert(OpdMapper.getVRegs(3).empty());
1721 
1722  if (InsRegs.empty()) {
1723  applyDefaultMapping(OpdMapper);
1724  executeInWaterfallLoop(MI, MRI, { 3 });
1725  return;
1726  }
1727 
1728  Register DstReg = MI.getOperand(0).getReg();
1729  Register SrcReg = MI.getOperand(1).getReg();
1730  Register InsReg = MI.getOperand(2).getReg();
1731  Register IdxReg = MI.getOperand(3).getReg();
1732  LLT SrcTy = MRI.getType(SrcReg);
1733  LLT InsTy = MRI.getType(InsReg);
1734  (void)InsTy;
1735 
1736  assert(InsTy.getSizeInBits() == 64);
1737 
1738  const LLT S32 = LLT::scalar(32);
1739  LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
1740 
1741  MachineIRBuilder B(MI);
1742  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
1743  auto One = B.buildConstant(S32, 1);
1744 
1745  // Turn the 64-bit element access into two 32-bit accesses. Prepare to
1746  // move all of the new instructions into a waterfall loop if necessary.
1747  //
1748  // Don't put the bitcast or constant in the loop.
1749  MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
1750 
1751  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
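 // For example (illustrative), inserting an s64 element:
 //   %r:(<2 x s64>) = G_INSERT_VECTOR_ELT %v, %val, %idx
 // becomes, on the bitcast <4 x s32> value, roughly:
 //   %lo = G_INSERT_VECTOR_ELT %cast, %val.lo, 2 * %idx
 //   %hi = G_INSERT_VECTOR_ELT %lo, %val.hi, 2 * %idx + 1
 //   %r = G_BITCAST %hi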
1752  auto IdxLo = B.buildShl(S32, IdxReg, One);
1753  auto IdxHi = B.buildAdd(S32, IdxLo, One);
1754 
1755  auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
1756  auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
1757  B.buildBitcast(DstReg, InsHi);
1758 
1759  const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1760  const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1761  const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI);
1762 
1763  MRI.setRegBank(InsReg, *InsSrcBank);
1764  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1765  MRI.setRegBank(InsLo.getReg(0), *DstBank);
1766  MRI.setRegBank(InsHi.getReg(0), *DstBank);
1767  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
1768  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
1769  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
1770 
1771 
1772  SmallSet<Register, 4> OpsToWaterfall;
1773  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
1774  MI.eraseFromParent();
1775  return;
1776  }
1777 
1778  B.setInstr(*Span.begin());
1779  MI.eraseFromParent();
1780 
1781  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1782  OpsToWaterfall, MRI);
1783  return;
1784  }
1785  case AMDGPU::G_INTRINSIC: {
1786  switch (MI.getIntrinsicID()) {
1787  case Intrinsic::amdgcn_s_buffer_load: {
1788  // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
1789  executeInWaterfallLoop(MI, MRI, { 2, 3 });
1790  return;
1791  }
1792  case Intrinsic::amdgcn_readlane: {
1793  substituteSimpleCopyRegs(OpdMapper, 2);
1794 
1795  assert(OpdMapper.getVRegs(0).empty());
1796  assert(OpdMapper.getVRegs(3).empty());
1797 
1798  // Make sure the index is an SGPR. It doesn't make sense to run this in a
1799  // waterfall loop, so assume it's a uniform value.
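 // constrainOpWithReadfirstlane rewrites a VGPR operand roughly
 // (illustrative) as:
 //   %u:sgpr(s32) = V_READFIRSTLANE_B32 %idx:vgpr(s32)
 // and then uses %u in place of %idx.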
1800  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1801  return;
1802  }
1803  case Intrinsic::amdgcn_writelane: {
1804  assert(OpdMapper.getVRegs(0).empty());
1805  assert(OpdMapper.getVRegs(2).empty());
1806  assert(OpdMapper.getVRegs(3).empty());
1807 
1808  substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
1809  constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
1810  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1811  return;
1812  }
1813  default:
1814  break;
1815  }
1816  break;
1817  }
1818  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
1819  auto IntrID = MI.getIntrinsicID();
1820  switch (IntrID) {
1821  case Intrinsic::amdgcn_buffer_load: {
1822  executeInWaterfallLoop(MI, MRI, { 2 });
1823  return;
1824  }
1825  case Intrinsic::amdgcn_ds_ordered_add:
1826  case Intrinsic::amdgcn_ds_ordered_swap: {
1827  // This is only allowed to execute with 1 lane, so readfirstlane is safe.
1828  assert(OpdMapper.getVRegs(0).empty());
1829  substituteSimpleCopyRegs(OpdMapper, 3);
1830  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1831  return;
1832  }
1833  case Intrinsic::amdgcn_ds_gws_init:
1834  case Intrinsic::amdgcn_ds_gws_barrier:
1835  case Intrinsic::amdgcn_ds_gws_sema_br: {
1836  // Only the first lane executes, so readfirstlane is safe.
1837  substituteSimpleCopyRegs(OpdMapper, 1);
1838  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1839  return;
1840  }
1841  case Intrinsic::amdgcn_ds_gws_sema_v:
1842  case Intrinsic::amdgcn_ds_gws_sema_p:
1843  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1844  // Only the first lane executes, so readfirstlane is safe.
1845  constrainOpWithReadfirstlane(MI, MRI, 1); // M0
1846  return;
1847  }
1848  case Intrinsic::amdgcn_s_sendmsg:
1849  case Intrinsic::amdgcn_s_sendmsghalt: {
1850  // FIXME: Should this use a waterfall loop?
1851  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1852  return;
1853  }
1854  case Intrinsic::amdgcn_raw_buffer_load:
1855  case Intrinsic::amdgcn_raw_buffer_load_format:
1856  case Intrinsic::amdgcn_raw_tbuffer_load:
1857  case Intrinsic::amdgcn_raw_buffer_store:
1858  case Intrinsic::amdgcn_raw_buffer_store_format:
1859  case Intrinsic::amdgcn_raw_tbuffer_store: {
1860  applyDefaultMapping(OpdMapper);
1861  executeInWaterfallLoop(MI, MRI, {2, 4});
1862  return;
1863  }
1864  case Intrinsic::amdgcn_struct_buffer_load:
1865  case Intrinsic::amdgcn_struct_buffer_store:
1866  case Intrinsic::amdgcn_struct_tbuffer_load:
1867  case Intrinsic::amdgcn_struct_tbuffer_store: {
1868  applyDefaultMapping(OpdMapper);
1869  executeInWaterfallLoop(MI, MRI, {2, 5});
1870  return;
1871  }
1872  default: {
1873  if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1874  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1875  // Non-images can have complications from operands that allow both SGPR
1876  // and VGPR. For now it's too complicated to figure out the final opcode
1877  // to derive the register bank from the MCInstrDesc.
1878  if (RSrcIntrin->IsImage) {
1879  applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
1880  return;
1881  }
1882  }
1883 
1884  break;
1885  }
1886  }
1887  break;
1888  }
1889  case AMDGPU::G_LOAD:
1890  case AMDGPU::G_ZEXTLOAD:
1891  case AMDGPU::G_SEXTLOAD: {
1892  if (applyMappingWideLoad(MI, OpdMapper, MRI))
1893  return;
1894  break;
1895  }
1896  default:
1897  break;
1898  }
1899 
1900  return applyDefaultMapping(OpdMapper);
1901 }
1902 
1903 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
1904  const MachineFunction &MF = *MI.getParent()->getParent();
1905  const MachineRegisterInfo &MRI = MF.getRegInfo();
1906  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1907  if (!MI.getOperand(i).isReg())
1908  continue;
1909  Register Reg = MI.getOperand(i).getReg();
1910  if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
1911  if (Bank->getID() == AMDGPU::VGPRRegBankID)
1912  return false;
1913 
1914  assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
1915  Bank->getID() == AMDGPU::SCCRegBankID);
1916  }
1917  }
1918  return true;
1919 }
1920 
1921 const RegisterBankInfo::InstructionMapping &
1922 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
1923  const MachineFunction &MF = *MI.getParent()->getParent();
1924  const MachineRegisterInfo &MRI = MF.getRegInfo();
1925  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1926 
1927  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1928  unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1929  unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
1930  OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
1931  }
1932  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1933  MI.getNumOperands());
1934 }
1935 
1936 const RegisterBankInfo::InstructionMapping &
1937 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
1938  const MachineFunction &MF = *MI.getParent()->getParent();
1939  const MachineRegisterInfo &MRI = MF.getRegInfo();
1940  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1941  unsigned OpdIdx = 0;
1942 
1943  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1944  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
1945 
1946  if (MI.getOperand(OpdIdx).isIntrinsicID())
1947  OpdsMapping[OpdIdx++] = nullptr;
1948 
1949  Register Reg1 = MI.getOperand(OpdIdx).getReg();
1950  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
1951 
1952  unsigned DefaultBankID = Size1 == 1 ?
1953  AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1954  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
1955 
1956  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
1957 
1958  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
1959  const MachineOperand &MO = MI.getOperand(OpdIdx);
1960  if (!MO.isReg())
1961  continue;
1962 
1963  unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
1964  unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1965  OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
1966  }
1967 
1968  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1969  MI.getNumOperands());
1970 }
1971 
1972 const RegisterBankInfo::InstructionMapping &
1973 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
1974  const MachineFunction &MF = *MI.getParent()->getParent();
1975  const MachineRegisterInfo &MRI = MF.getRegInfo();
1976  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1977 
1978  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1979  const MachineOperand &Op = MI.getOperand(I);
1980  if (!Op.isReg())
1981  continue;
1982 
1983  unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
1984  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1985  }
1986 
1987  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1988  MI.getNumOperands());
1989 }
1990 
1991 const RegisterBankInfo::InstructionMapping &
1992 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
1993  const MachineInstr &MI,
1994  int RsrcIdx) const {
1995  // The reported argument index is relative to the IR intrinsic call arguments,
1996  // so we need to shift by the number of defs and the intrinsic ID.
1997  RsrcIdx += MI.getNumExplicitDefs() + 1;
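 // For example (illustrative): with a single def, IR argument 0 lands at
 // machine operand 2 (one def plus the intrinsic ID operand), so an RsrcIdx
 // of 1 maps to machine operand 3.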
1998 
1999  const int NumOps = MI.getNumOperands();
2000  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
2001 
2002  // TODO: Should packed/unpacked D16 difference be reported here as part of
2003  // the value mapping?
2004  for (int I = 0; I != NumOps; ++I) {
2005  if (!MI.getOperand(I).isReg())
2006  continue;
2007 
2008  Register OpReg = MI.getOperand(I).getReg();
2009  unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
2010 
2011  // FIXME: Probably need a new intrinsic register bank searchable table to
2012  // handle arbitrary intrinsics easily.
2013  //
2014  // If this has a sampler, it immediately follows rsrc.
2015  const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
2016 
2017  if (MustBeSGPR) {
2018  // If this must be an SGPR, we must report whatever it is as legal.
2019  unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2020  OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
2021  } else {
2022  // Some operands must be VGPR, and these are easy to copy to.
2023  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2024  }
2025  }
2026 
2027  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
2028 }
2029 
2030 const RegisterBankInfo::InstructionMapping &
2031 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
2032 
2033  const MachineFunction &MF = *MI.getParent()->getParent();
2034  const MachineRegisterInfo &MRI = MF.getRegInfo();
2035  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
2036  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2037  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
2038  Register PtrReg = MI.getOperand(1).getReg();
2039  LLT PtrTy = MRI.getType(PtrReg);
2040  unsigned AS = PtrTy.getAddressSpace();
2041  unsigned PtrSize = PtrTy.getSizeInBits();
2042 
2043  const ValueMapping *ValMapping;
2044  const ValueMapping *PtrMapping;
2045 
2046  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2047 
2048  if (PtrBank == &AMDGPU::SGPRRegBank &&
2049  (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
2050  AS != AMDGPUAS::PRIVATE_ADDRESS) &&
2051  isInstrUniform(MI)) {
2052  // We have a uniform instruction so we want to use an SMRD load
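 // For example (illustrative), a uniform constant-address load such as
 //   %d:sgpr(s32) = G_LOAD %p:sgpr(p4) :: (load 4, addrspace 4)
 // keeps both operands in SGPRs and can later select to an s_load_dword,
 // while a divergent pointer takes the VGPR path below.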
2053  ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2054  PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
2055  } else {
2056  ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
2057  PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
2058  }
2059 
2060  OpdsMapping[0] = ValMapping;
2061  OpdsMapping[1] = PtrMapping;
2062  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
2063  1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
2064  return Mapping;
2065 
2066  // FIXME: Do we want to add a mapping for FLAT load, or should we just
2067  // handle that during instruction selection?
2068 }
2069 
2070 unsigned
2071 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
2072  const MachineRegisterInfo &MRI,
2073  const TargetRegisterInfo &TRI,
2074  unsigned Default) const {
2075 
2076  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
2077  return Bank ? Bank->getID() : Default;
2078 }
2079 
2080 
2081 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
2082  return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
2083  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2084 }
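 // regBankUnion(SGPR, SGPR) == SGPR; any other combination (i.e. any VGPR
 // input) yields VGPR.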
2085 
2086 const RegisterBankInfo::ValueMapping *
2087 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
2088  const MachineRegisterInfo &MRI,
2089  const TargetRegisterInfo &TRI) const {
2090  // Lie and claim anything is legal, even though this needs to be an SGPR.
2091  // applyMapping will have to deal with it as a waterfall loop.
2092  unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
2093  unsigned Size = getSizeInBits(Reg, MRI, TRI);
2094  return AMDGPU::getValueMapping(Bank, Size);
2095 }
2096 
2097 const RegisterBankInfo::ValueMapping *
2098 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
2099  const MachineRegisterInfo &MRI,
2100  const TargetRegisterInfo &TRI) const {
2101  unsigned Size = getSizeInBits(Reg, MRI, TRI);
2102  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2103 }
2104 
2105 ///
2106 /// This function must return a legal mapping, because
2107 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
2108 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
2109 /// VGPR-to-SGPR copy to be generated is illegal.
2110 ///
2111 const RegisterBankInfo::InstructionMapping &
2112 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
2113  const MachineFunction &MF = *MI.getParent()->getParent();
2114  const MachineRegisterInfo &MRI = MF.getRegInfo();
2115 
2116  if (MI.isRegSequence()) {
2117  // If any input is a VGPR, the result must be a VGPR. The default handling
2118  // assumes any copy between banks is legal.
2119  unsigned BankID = AMDGPU::SGPRRegBankID;
2120 
2121  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2122  auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
2123  // It doesn't make sense to use vcc or scc banks here, so just ignore
2124  // them.
2125  if (OpBank != AMDGPU::SGPRRegBankID) {
2126  BankID = AMDGPU::VGPRRegBankID;
2127  break;
2128  }
2129  }
2130  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2131 
2132  const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
2133  return getInstructionMapping(
2134  1, /*Cost*/ 1,
2135  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2136  }
2137 
2138  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
2139  // properly.
2140  //
2141  // TODO: There are additional exec masking dependencies to analyze.
2142  if (MI.getOpcode() == TargetOpcode::G_PHI) {
2143  // TODO: Generate proper invalid bank enum.
2144  int ResultBank = -1;
2145 
2146  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2147  Register Reg = MI.getOperand(I).getReg();
2148  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
2149 
2150  // FIXME: Assuming VGPR for any undetermined inputs.
2151  if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
2152  ResultBank = AMDGPU::VGPRRegBankID;
2153  break;
2154  }
2155 
2156  unsigned OpBank = Bank->getID();
2157  // scc, scc -> sgpr
2158  if (OpBank == AMDGPU::SCCRegBankID) {
2159  // There's only one SCC register, so a phi requires copying to SGPR.
2160  OpBank = AMDGPU::SGPRRegBankID;
2161  } else if (OpBank == AMDGPU::VCCRegBankID) {
2162  // vcc, vcc -> vcc
2163  // vcc, sgpr -> vgpr
2164  if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
2165  ResultBank = AMDGPU::VGPRRegBankID;
2166  break;
2167  }
2168  }
2169 
2170  ResultBank = OpBank;
2171  }
2172 
2173  assert(ResultBank != -1);
2174 
2175  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2176 
2177  const ValueMapping &ValMap =
2178  getValueMapping(0, Size, getRegBank(ResultBank));
2179  return getInstructionMapping(
2180  1, /*Cost*/ 1,
2181  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2182  }
2183 
2184  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
2185  if (Mapping.isValid())
2186  return Mapping;
2187 
2188  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2189 
2190  switch (MI.getOpcode()) {
2191  default:
2192  return getInvalidInstructionMapping();
2193 
2194  case AMDGPU::G_AND:
2195  case AMDGPU::G_OR:
2196  case AMDGPU::G_XOR: {
2197  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2198  if (Size == 1) {
2199  const RegisterBank *DstBank
2200  = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
2201 
2202  unsigned TargetBankID = -1;
2203  unsigned BankLHS = -1;
2204  unsigned BankRHS = -1;
2205  if (DstBank) {
2206  TargetBankID = DstBank->getID();
2207  if (DstBank == &AMDGPU::VCCRegBank) {
2208  TargetBankID = AMDGPU::VCCRegBankID;
2209  BankLHS = AMDGPU::VCCRegBankID;
2210  BankRHS = AMDGPU::VCCRegBankID;
2211  } else if (DstBank == &AMDGPU::SCCRegBank) {
2212  TargetBankID = AMDGPU::SCCRegBankID;
2213  BankLHS = AMDGPU::SGPRRegBankID;
2214  BankRHS = AMDGPU::SGPRRegBankID;
2215  } else {
2216  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2217  AMDGPU::SGPRRegBankID);
2218  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2219  AMDGPU::SGPRRegBankID);
2220  }
2221  } else {
2222  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2223  AMDGPU::VCCRegBankID);
2224  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2225  AMDGPU::VCCRegBankID);
2226 
2227  // Both inputs should be true booleans to produce a boolean result.
2228  if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2229  TargetBankID = AMDGPU::VGPRRegBankID;
2230  } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2231  TargetBankID = AMDGPU::VCCRegBankID;
2232  BankLHS = AMDGPU::VCCRegBankID;
2233  BankRHS = AMDGPU::VCCRegBankID;
2234  } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2235  TargetBankID = AMDGPU::SGPRRegBankID;
2236  } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
2237  // The operation must be done on a 32-bit register, but it will set
2238  // scc. The result type could interchangeably be SCC or SGPR, since
2239  // both values will be produced.
2240  TargetBankID = AMDGPU::SCCRegBankID;
2241  BankLHS = AMDGPU::SGPRRegBankID;
2242  BankRHS = AMDGPU::SGPRRegBankID;
2243  }
2244  }
2245 
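 // Net effect of the bank choice above (illustrative): an s1 G_AND of two
 // VCC values stays in VCC as a lane-mask operation, mixing an SGPR bool
 // with a VCC bool forces VCC, and any VGPR input forces a VGPR result.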
2246  OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
2247  OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
2248  OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
2249  break;
2250  }
2251 
2252  if (Size == 64) {
2253 
2254  if (isSALUMapping(MI)) {
2255  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
2256  OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
2257  } else {
2258  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
2259  unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
2260  OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
2261 
2262  unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
2263  OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
2264  }
2265 
2266  break;
2267  }
2268 
2269  LLVM_FALLTHROUGH;
2270  }
2271  case AMDGPU::G_GEP:
2272  case AMDGPU::G_ADD:
2273  case AMDGPU::G_SUB:
2274  case AMDGPU::G_MUL:
2275  case AMDGPU::G_SHL:
2276  case AMDGPU::G_LSHR:
2277  case AMDGPU::G_ASHR:
2278  case AMDGPU::G_UADDO:
2279  case AMDGPU::G_USUBO:
2280  case AMDGPU::G_UADDE:
2281  case AMDGPU::G_SADDE:
2282  case AMDGPU::G_USUBE:
2283  case AMDGPU::G_SSUBE:
2284  case AMDGPU::G_SMIN:
2285  case AMDGPU::G_SMAX:
2286  case AMDGPU::G_UMIN:
2287  case AMDGPU::G_UMAX:
2288  if (isSALUMapping(MI))
2289  return getDefaultMappingSOP(MI);
2290  LLVM_FALLTHROUGH;
2291 
2292  case AMDGPU::G_FADD:
2293  case AMDGPU::G_FSUB:
2294  case AMDGPU::G_FPTOSI:
2295  case AMDGPU::G_FPTOUI:
2296  case AMDGPU::G_FMUL:
2297  case AMDGPU::G_FMA:
2298  case AMDGPU::G_FMAD:
2299  case AMDGPU::G_FSQRT:
2300  case AMDGPU::G_FFLOOR:
2301  case AMDGPU::G_FCEIL:
2302  case AMDGPU::G_FRINT:
2303  case AMDGPU::G_SITOFP:
2304  case AMDGPU::G_UITOFP:
2305  case AMDGPU::G_FPTRUNC:
2306  case AMDGPU::G_FPEXT:
2307  case AMDGPU::G_FEXP2:
2308  case AMDGPU::G_FLOG2:
2309  case AMDGPU::G_FMINNUM:
2310  case AMDGPU::G_FMAXNUM:
2311  case AMDGPU::G_FMINNUM_IEEE:
2312  case AMDGPU::G_FMAXNUM_IEEE:
2313  case AMDGPU::G_FCANONICALIZE:
2314  case AMDGPU::G_INTRINSIC_TRUNC:
2315  case AMDGPU::G_INTRINSIC_ROUND:
2316  case AMDGPU::G_AMDGPU_FFBH_U32:
2317  return getDefaultMappingVOP(MI);
2318  case AMDGPU::G_UMULH:
2319  case AMDGPU::G_SMULH: {
2320  if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
2321  return getDefaultMappingSOP(MI);
2322  return getDefaultMappingVOP(MI);
2323  }
2324  case AMDGPU::G_IMPLICIT_DEF: {
2325  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2326  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2327  break;
2328  }
2329  case AMDGPU::G_FCONSTANT:
2330  case AMDGPU::G_CONSTANT:
2331  case AMDGPU::G_GLOBAL_VALUE:
2332  case AMDGPU::G_BLOCK_ADDR: {
2333  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2334  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2335  break;
2336  }
2337  case AMDGPU::G_FRAME_INDEX: {
2338  // TODO: This should be the same as other constants, but eliminateFrameIndex
2339  // currently assumes VALU uses.
2340  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2341  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2342  break;
2343  }
2344  case AMDGPU::G_INSERT: {
2345  unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2346  AMDGPU::VGPRRegBankID;
2347  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2348  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2349  unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
2350  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2351  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2352  OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
2353  OpdsMapping[3] = nullptr;
2354  break;
2355  }
2356  case AMDGPU::G_EXTRACT: {
2357  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2358  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2359  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2360  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2361  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2362  OpdsMapping[2] = nullptr;
2363  break;
2364  }
2365  case AMDGPU::G_BUILD_VECTOR:
2366  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2367  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
2368  if (DstTy == LLT::vector(2, 16)) {
2369  unsigned DstSize = DstTy.getSizeInBits();
2370  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2371  unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2372  unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2373  unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
2374 
2375  OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
2376  OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
2377  OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
2378  break;
2379  }
2380 
2381  LLVM_FALLTHROUGH;
2382  }
2383  case AMDGPU::G_MERGE_VALUES:
2384  case AMDGPU::G_CONCAT_VECTORS: {
2385  unsigned Bank = isSALUMapping(MI) ?
2386  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2387  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2388  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2389 
2390  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2391  // Op1 and Dst should use the same register bank.
2392  for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
2393  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
2394  break;
2395  }
2396  case AMDGPU::G_BITCAST:
2397  case AMDGPU::G_INTTOPTR:
2398  case AMDGPU::G_PTRTOINT:
2399  case AMDGPU::G_CTLZ:
2400  case AMDGPU::G_CTLZ_ZERO_UNDEF:
2401  case AMDGPU::G_CTTZ:
2402  case AMDGPU::G_CTTZ_ZERO_UNDEF:
2403  case AMDGPU::G_CTPOP:
2404  case AMDGPU::G_BSWAP:
2405  case AMDGPU::G_BITREVERSE:
2406  case AMDGPU::G_FABS:
2407  case AMDGPU::G_FNEG: {
2408  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2409  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2410  OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
2411  break;
2412  }
2413  case AMDGPU::G_TRUNC: {
2414  Register Dst = MI.getOperand(0).getReg();
2415  Register Src = MI.getOperand(1).getReg();
2416  unsigned Bank = getRegBankID(Src, MRI, *TRI);
2417  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2418  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2419  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2420  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
2421  break;
2422  }
2423  case AMDGPU::G_ZEXT:
2424  case AMDGPU::G_SEXT:
2425  case AMDGPU::G_ANYEXT: {
2426  Register Dst = MI.getOperand(0).getReg();
2427  Register Src = MI.getOperand(1).getReg();
2428  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2429  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2430 
2431  unsigned DstBank;
2432  const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
2433  assert(SrcBank);
2434  switch (SrcBank->getID()) {
2435  case AMDGPU::SCCRegBankID:
2436  case AMDGPU::SGPRRegBankID:
2437  DstBank = AMDGPU::SGPRRegBankID;
2438  break;
2439  default:
2440  DstBank = AMDGPU::VGPRRegBankID;
2441  break;
2442  }
2443 
2444  // TODO: Should anyext be split into 32-bit part as well?
2445  if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
2446  OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
2447  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
2448  } else {
2449  // Scalar extend can use 64-bit BFE, but VGPRs require extending to
2450  // 32-bits, and then to 64.
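 // For example (illustrative): an SGPR s16 -> s64 G_SEXT can select to a
 // single S_BFE_I64, while the VGPR mapping splits the destination and uses
 // the 32-bit extend plus shift expansion from applyMappingImpl above.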
2451  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
2452  OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
2453  SrcSize);
2454  }
2455  break;
2456  }
2457  case AMDGPU::G_FCMP: {
2458  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2459  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2460  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2461  OpdsMapping[1] = nullptr; // Predicate Operand.
2462  OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2463  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2464  break;
2465  }
2466  case AMDGPU::G_STORE: {
2467  assert(MI.getOperand(0).isReg());
2468  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2469  // FIXME: We need to specify a different reg bank once scalar stores
2470  // are supported.
2471  const ValueMapping *ValMapping =
2472  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2473  // FIXME: Depending on the type of store, the pointer could be in
2474  // the SGPR Reg bank.
2475  // FIXME: Pointer size should be based on the address space.
2476  const ValueMapping *PtrMapping =
2477  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
2478 
2479  OpdsMapping[0] = ValMapping;
2480  OpdsMapping[1] = PtrMapping;
2481  break;
2482  }
2483 
2484  case AMDGPU::G_ICMP: {
2485  auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
2486  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2487  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2488  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2489 
2490  bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
2491  Op3Bank == AMDGPU::SGPRRegBankID &&
2492  (Size == 32 || (Size == 64 &&
2493  (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
2494  Subtarget.hasScalarCompareEq64()));
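 // For example (illustrative): an eq compare of two 64-bit SGPR values can
 // use s_cmp_eq_u64 and produce SCC, but a 64-bit signed less-than has no
 // scalar form, so its result must be mapped to VCC.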
2495 
2496  unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2497 
2498  OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
2499  OpdsMapping[1] = nullptr; // Predicate Operand.
2500  OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2501  OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
2502  break;
2503  }
2504  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2505  // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
2506  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2507  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2508  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2509  unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2510  unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2511  unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
2512 
2513  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
2514  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
2515 
2516  // The index can be in either bank if the source vector is VGPR.
2517  OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2518  break;
2519  }
2520  case AMDGPU::G_INSERT_VECTOR_ELT: {
2521  unsigned OutputBankID = isSALUMapping(MI) ?
2522  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2523 
2524  unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2525  unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2526  unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2527  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2528  unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
2529  MRI, *TRI);
2530  unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2531 
2532  OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2533  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
2534  OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
2535  InsertSize);
2536 
2537  // The index can be in either bank if the source vector is VGPR.
2538  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
2539  break;
2540  }
2541  case AMDGPU::G_UNMERGE_VALUES: {
2542  unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2543  AMDGPU::VGPRRegBankID;
2544 
2545  // Op1 and Dst should use the same register bank.
2546  // FIXME: Shouldn't this be the default? Why do we need to handle this?
2547  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2548  unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2549  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
2550  }
2551  break;
2552  }
2553  case AMDGPU::G_INTRINSIC: {
2554  switch (MI.getIntrinsicID()) {
2555  default:
2556  return getInvalidInstructionMapping();
2557  case Intrinsic::amdgcn_div_fmas:
2558  case Intrinsic::amdgcn_trig_preop:
2559  case Intrinsic::amdgcn_sin:
2560  case Intrinsic::amdgcn_cos:
2561  case Intrinsic::amdgcn_log_clamp:
2562  case Intrinsic::amdgcn_rcp:
2563  case Intrinsic::amdgcn_rcp_legacy:
2564  case Intrinsic::amdgcn_rsq:
2565  case Intrinsic::amdgcn_rsq_legacy:
2566  case Intrinsic::amdgcn_rsq_clamp:
2567  case Intrinsic::amdgcn_ldexp:
2568  case Intrinsic::amdgcn_frexp_mant:
2569  case Intrinsic::amdgcn_frexp_exp:
2570  case Intrinsic::amdgcn_fract:
2571  case Intrinsic::amdgcn_cvt_pkrtz:
2572  case Intrinsic::amdgcn_cvt_pknorm_i16:
2573  case Intrinsic::amdgcn_cvt_pknorm_u16:
2574  case Intrinsic::amdgcn_cvt_pk_i16:
2575  case Intrinsic::amdgcn_cvt_pk_u16:
2576  case Intrinsic::amdgcn_fmed3:
2577  case Intrinsic::amdgcn_cubeid:
2578  case Intrinsic::amdgcn_cubema:
2579  case Intrinsic::amdgcn_cubesc:
2580  case Intrinsic::amdgcn_cubetc:
2581  case Intrinsic::amdgcn_sffbh:
2582  case Intrinsic::amdgcn_fmad_ftz:
2583  case Intrinsic::amdgcn_mbcnt_lo:
2584  case Intrinsic::amdgcn_mbcnt_hi:
2585  case Intrinsic::amdgcn_ubfe:
2586  case Intrinsic::amdgcn_sbfe:
2587  case Intrinsic::amdgcn_mul_u24:
2588  case Intrinsic::amdgcn_mul_i24:
2589  case Intrinsic::amdgcn_lerp:
2590  case Intrinsic::amdgcn_sad_u8:
2591  case Intrinsic::amdgcn_msad_u8:
2592  case Intrinsic::amdgcn_sad_hi_u8:
2593  case Intrinsic::amdgcn_sad_u16:
2594  case Intrinsic::amdgcn_qsad_pk_u16_u8:
2595  case Intrinsic::amdgcn_mqsad_pk_u16_u8:
2596  case Intrinsic::amdgcn_mqsad_u32_u8:
2597  case Intrinsic::amdgcn_cvt_pk_u8_f32:
2598  case Intrinsic::amdgcn_alignbit:
2599  case Intrinsic::amdgcn_alignbyte:
2600  case Intrinsic::amdgcn_fdot2:
2601  case Intrinsic::amdgcn_sdot2:
2602  case Intrinsic::amdgcn_udot2:
2603  case Intrinsic::amdgcn_sdot4:
2604  case Intrinsic::amdgcn_udot4:
2605  case Intrinsic::amdgcn_sdot8:
2606  case Intrinsic::amdgcn_udot8:
2607  case Intrinsic::amdgcn_wwm:
2608  case Intrinsic::amdgcn_wqm:
2609  return getDefaultMappingVOP(MI);
2610  case Intrinsic::amdgcn_ds_swizzle:
2611  case Intrinsic::amdgcn_ds_permute:
2612  case Intrinsic::amdgcn_ds_bpermute:
2613  case Intrinsic::amdgcn_update_dpp:
2614  return getDefaultMappingAllVGPR(MI);
2615  case Intrinsic::amdgcn_kernarg_segment_ptr:
2616  case Intrinsic::amdgcn_s_getpc:
2617  case Intrinsic::amdgcn_groupstaticsize: {
2618  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2619  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2620  break;
2621  }
2622  case Intrinsic::amdgcn_wqm_vote: {
2623  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2624  OpdsMapping[0] = OpdsMapping[2]
2625  = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
2626  break;
2627  }
2628  case Intrinsic::amdgcn_s_buffer_load: {
2629  // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
2630  Register RSrc = MI.getOperand(2).getReg(); // SGPR
2631  Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
2632 
2633  unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2634  unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2635  unsigned Size3 = MRI.getType(Offset).getSizeInBits();
2636 
2637  unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2638  unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2639 
2640  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
2641  OpdsMapping[1] = nullptr; // intrinsic id
2642 
2643  // Lie and claim everything is legal, even though some need to be
2644  // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2645  OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2646  OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
2647  OpdsMapping[4] = nullptr;
2648  break;
2649  }
2650  case Intrinsic::amdgcn_div_scale: {
2651  unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2652  unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2653  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
2654  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
2655 
2656  unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2657  OpdsMapping[3] = AMDGPU::getValueMapping(
2658  getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
2659  OpdsMapping[4] = AMDGPU::getValueMapping(
2660  getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
2661 
2662  break;
2663  }
2664  case Intrinsic::amdgcn_class: {
2665  Register Src0Reg = MI.getOperand(2).getReg();
2666  Register Src1Reg = MI.getOperand(3).getReg();
2667  unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
2668  unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
2669  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2670  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
2671  OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
2672  Src0Size);
2673  OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
2674  Src1Size);
2675  break;
2676  }
2677  case Intrinsic::amdgcn_icmp:
2678  case Intrinsic::amdgcn_fcmp: {
2679  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2680  // This is not VCCRegBank because this is not used in boolean contexts.
2681  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2682  unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2683  unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2684  unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2685  OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
2686  OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
2687  break;
2688  }
2689  case Intrinsic::amdgcn_readlane: {
2690  // This must be an SGPR, but accept a VGPR.
2691  Register IdxReg = MI.getOperand(3).getReg();
2692  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2693  unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2694  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2695  LLVM_FALLTHROUGH;
2696  }
2697  case Intrinsic::amdgcn_readfirstlane: {
2698  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2699  unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2700  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2701  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2702  break;
2703  }
2704  case Intrinsic::amdgcn_writelane: {
2705  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2706  Register SrcReg = MI.getOperand(2).getReg();
2707  unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
2708  unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2709  Register IdxReg = MI.getOperand(3).getReg();
2710  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2711  unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2712  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2713 
2714  // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
2715  // to legalize.
2716  OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
2717  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2718  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2719  break;
2720  }
2721  case Intrinsic::amdgcn_if_break: {
2722  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2723  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2724  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2725  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2726  break;
2727  }
2728  }
2729  break;
2730  }
2731  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2732  auto IntrID = MI.getIntrinsicID();
2733  switch (IntrID) {
2734  case Intrinsic::amdgcn_s_getreg:
2735  case Intrinsic::amdgcn_s_memtime:
2736  case Intrinsic::amdgcn_s_memrealtime:
2737  case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
2738  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2739  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2740  break;
2741  }
2742  case Intrinsic::amdgcn_ds_append:
2743  case Intrinsic::amdgcn_ds_consume:
2744  case Intrinsic::amdgcn_ds_fadd:
2745  case Intrinsic::amdgcn_ds_fmin:
2746  case Intrinsic::amdgcn_ds_fmax:
2747  case Intrinsic::amdgcn_atomic_inc:
2748  case Intrinsic::amdgcn_atomic_dec:
2749  return getDefaultMappingAllVGPR(MI);
2750  case Intrinsic::amdgcn_ds_ordered_add:
2751  case Intrinsic::amdgcn_ds_ordered_swap: {
2752  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2753  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2754  unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2755  AMDGPU::SGPRRegBankID);
2756  OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
2757  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2758  break;
2759  }
2760  case Intrinsic::amdgcn_exp_compr:
2761  OpdsMapping[0] = nullptr; // IntrinsicID
2762  // FIXME: These are immediate values which can't be read from registers.
2763  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2764  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2765  // FIXME: Could we support packed types here?
2766  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2767  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2768  // FIXME: These are immediate values which can't be read from registers.
2769  OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2770  OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2771  break;
2772  case Intrinsic::amdgcn_exp:
2773  // FIXME: Could we support packed types here?
2774  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2775  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2776  OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2777  OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2778  break;
2779  case Intrinsic::amdgcn_buffer_load: {
2780  Register RSrc = MI.getOperand(2).getReg(); // SGPR
2781  Register VIndex = MI.getOperand(3).getReg(); // VGPR
2782  Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
2783 
2784  unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2785  unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2786  unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
2787  unsigned Size4 = MRI.getType(Offset).getSizeInBits();
2788 
2789  unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2790  unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2791 
2792  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2793  OpdsMapping[1] = nullptr; // intrinsic id
2794 
2795  // Lie and claim everything is legal, even though some need to be
2796  // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2797  OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2798  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
2799  OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
2800  OpdsMapping[5] = nullptr;
2801  OpdsMapping[6] = nullptr;
2802  break;
2803  }
2804  case Intrinsic::amdgcn_s_sendmsg:
2805  case Intrinsic::amdgcn_s_sendmsghalt: {
2806  // This must be an SGPR, but accept a VGPR.
2807  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2808  AMDGPU::SGPRRegBankID);
2809  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2810  break;
2811  }
2812  case Intrinsic::amdgcn_end_cf:
2813  case Intrinsic::amdgcn_init_exec: {
2814  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2815  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2816  break;
2817  }
2818  case Intrinsic::amdgcn_else: {
2819  unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2820  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2821  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2822  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
2823  break;
2824  }
2825  case Intrinsic::amdgcn_kill: {
2826  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2827  break;
2828  }
2829  case Intrinsic::amdgcn_raw_buffer_load:
2830  case Intrinsic::amdgcn_raw_tbuffer_load: {
2831  // FIXME: Should make intrinsic ID the last operand of the instruction,
2832  // then this would be the same as store
2833  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2834  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2835  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2836  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2837  break;
2838  }
2839  case Intrinsic::amdgcn_raw_buffer_store:
2840  case Intrinsic::amdgcn_raw_buffer_store_format:
2841  case Intrinsic::amdgcn_raw_tbuffer_store: {
2842  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2843  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2844  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2845  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2846  break;
2847  }
2848  case Intrinsic::amdgcn_struct_buffer_load:
2849  case Intrinsic::amdgcn_struct_tbuffer_load: {
2850  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
2851  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2852  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2853  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2854  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2855  break;
2856  }
2857  case Intrinsic::amdgcn_struct_buffer_store:
2858  case Intrinsic::amdgcn_struct_tbuffer_store: {
2859  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
2860  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
2861  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
2862  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
2863  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
2864  break;
2865  }
2866  case Intrinsic::amdgcn_init_exec_from_input: {
2867  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2868  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2869  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2870  break;
2871  }
2872  case Intrinsic::amdgcn_ds_gws_init:
2873  case Intrinsic::amdgcn_ds_gws_barrier:
2874  case Intrinsic::amdgcn_ds_gws_sema_br: {
2875  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2876 
2877  // This must be an SGPR, but accept a VGPR.
2878  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2879  AMDGPU::SGPRRegBankID);
2880  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2881  break;
2882  }
2883  case Intrinsic::amdgcn_ds_gws_sema_v:
2884  case Intrinsic::amdgcn_ds_gws_sema_p:
2885  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2886  // This must be an SGPR, but accept a VGPR.
2887  unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2888  AMDGPU::SGPRRegBankID);
2889  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
2890  break;
2891  }
2892  default:
2893  if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2894  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2895  // Non-images can have complications from operands that allow both SGPR
2896  // and VGPR. For now it's too complicated to figure out the final opcode
2897  // to derive the register bank from the MCInstrDesc.
2898  if (RSrcIntrin->IsImage)
2899  return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
2900  }
2901 
2902  return getInvalidInstructionMapping();
2903  }
2904  break;
2905  }
2906  case AMDGPU::G_SELECT: {
2907  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2908  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2909  AMDGPU::SGPRRegBankID);
2910  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
2911  AMDGPU::SGPRRegBankID);
2912  bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
2913  Op3Bank == AMDGPU::SGPRRegBankID;
2914 
2915  unsigned CondBankDefault = SGPRSrcs ?
2916  AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2917  unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2918  CondBankDefault);
2919  if (CondBank == AMDGPU::SGPRRegBankID)
2920  CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2921  else if (CondBank == AMDGPU::VGPRRegBankID)
2922  CondBank = AMDGPU::VCCRegBankID;
2923 
2924  unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
2925  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2926 
2927  assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
2928 
2929  if (Size == 64) {
2930  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2931  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2932  OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2933  OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2934  } else {
2935  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
2936  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2937  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
2938  OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
2939  }
2940 
2941  break;
2942  }
2943 
2944  case AMDGPU::G_LOAD:
2945  case AMDGPU::G_ZEXTLOAD:
2946  case AMDGPU::G_SEXTLOAD:
2947  return getInstrMappingForLoad(MI);
2948 
2949  case AMDGPU::G_ATOMICRMW_XCHG:
2950  case AMDGPU::G_ATOMICRMW_ADD:
2951  case AMDGPU::G_ATOMICRMW_SUB:
2952  case AMDGPU::G_ATOMICRMW_AND:
2953  case AMDGPU::G_ATOMICRMW_OR:
2954  case AMDGPU::G_ATOMICRMW_XOR:
2955  case AMDGPU::G_ATOMICRMW_MAX:
2956  case AMDGPU::G_ATOMICRMW_MIN:
2957  case AMDGPU::G_ATOMICRMW_UMAX:
2958  case AMDGPU::G_ATOMICRMW_UMIN:
2959  case AMDGPU::G_ATOMICRMW_FADD:
2960  case AMDGPU::G_ATOMIC_CMPXCHG: {
2961  return getDefaultMappingAllVGPR(MI);
2962  }
2963  case AMDGPU::G_BRCOND: {
2964  unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
2965  AMDGPU::SGPRRegBankID);
2966  assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
2967  if (Bank != AMDGPU::SCCRegBankID)
2968  Bank = AMDGPU::VCCRegBankID;
2969 
2970  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
2971  break;
2972  }
2973  }
2974 
2975  return getInstructionMapping(/*ID*/1, /*Cost*/1,
2976  getOperandsMapping(OpdsMapping),
2977  MI.getNumOperands());
2978 }
mir Rename Register Operands
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
bool isSGPRClass(const TargetRegisterClass *RC) const
LLT getElementType() const
Returns the vector&#39;s element type. Only valid for vector types.
MachineInstrBuilder buildSelect(const DstOp &Res, const SrcOp &Tst, const SrcOp &Op0, const SrcOp &Op1, Optional< unsigned > Flags=None)
Build and insert a Res = G_SELECT Tst, Op0, Op1.
const RegClassOrRegBank & getRegClassOrRegBank(unsigned Reg) const
Return the register bank or register class of Reg.
static void substituteSimpleCopyRegs(const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx)
MachineFunction & getMF()
Getter for the function we currently build.
static unsigned extractDLC(unsigned CachePolicy)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:32
MachineInstrBuilder buildLShr(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, Optional< unsigned > Flags=None)
void setReg(Register Reg)
Change the register this operand corresponds to.
static LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
MachineInstrBuilder buildExtractVectorElement(const DstOp &Res, const SrcOp &Val, const SrcOp &Idx)
Build and insert Res = G_EXTRACT_VECTOR_ELT Val, Idx.
AMDGPURegisterBankInfo(const GCNSubtarget &STI)
const RegisterBank * RegBank
Register bank where the partial value lives.
void setChangeObserver(GISelChangeObserver &Observer)
Address space for private memory.
Definition: AMDGPU.h:275
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool hasScalarCompareEq64() const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineRegisterInfo * getMRI()
Getter for MRI.
Abstract class that contains various methods for clients to notify about changes. ...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
unsigned const MachineRegisterInfo * MRI
static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy)
const InstructionMapping & getInvalidInstructionMapping() const
Method to get a uniquely generated invalid InstructionMapping.
MachineInstrBuilder buildInstr(unsigned Opcode)
Build and insert <empty> = Opcode <empty>.
Address space for local memory.
Definition: AMDGPU.h:274
MachineInstrBuilder buildZExt(const DstOp &Res, const SrcOp &Op)
Build and insert Res = G_ZEXT Op.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&... args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:655
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
MachineInstrBuilder buildBuildVector(const DstOp &Res, ArrayRef< Register > Ops)
Build and insert Res = G_BUILD_VECTOR Op0, ...
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Helper class to build MachineInstr.
static bool isUniformMMO(const MachineMemOperand *MMO)
unsigned Length
Length of this mapping in bits.
void setType(unsigned VReg, LLT Ty)
Set the low-level type of VReg to Ty.
int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:566
const ValueMapping & getValueMapping(unsigned StartIdx, unsigned Length, const RegisterBank &RegBank) const
The most common ValueMapping consists of a single PartialMapping.
void setInstr(MachineInstr &MI)
Set the insertion point to before MI.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:732
static unsigned extractGLC(unsigned CachePolicy)
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
Definition: MachineInstr.h:498
constexpr double e
Definition: MathExtras.h:57
self_iterator getIterator()
Definition: ilist_node.h:81
unsigned getAddressSpace() const
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn&#39;t already there.
Definition: SmallSet.h:180
unsigned getBreakDownCost(const ValueMapping &ValMapping, const RegisterBank *CurBank=nullptr) const override
Get the cost of using ValMapping to decompose a register.
R600 Clause Merge
MachineInstrBuilder buildCopy(const DstOp &Res, const SrcOp &Op)
Build and insert Res = COPY Op.
MachineInstrBuilder buildTrunc(const DstOp &Res, const SrcOp &Op)
Build and insert Res = G_TRUNC Op.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isZero(Register Reg, MachineRegisterInfo &MRI)
uint64_t getAlignment() const
Return the minimum known alignment in bytes of the actual memory reference.
bool hasUnpackedD16VMem() const
bool isValid() const
Check whether this object is valid.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Helper class used to get/create the virtual registers that will be used to replace the MachineOperand...
Iterator for intrusive lists based on ilist_node.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:390
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:551
static LLT getHalfSizedType(LLT Ty)
RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
MachineOperand class - Representation of each machine instruction operand.
const PartialMapping * BreakDown
How the value is broken down between the different register banks.
static void applyDefaultMapping(const OperandsMapper &OpdMapper)
Helper method to apply something that is like the default mapping.
unsigned getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Promote Memory to Register
Definition: Mem2Reg.cpp:109
static MachineInstr * getOtherVRegDef(const MachineRegisterInfo &MRI, Register Reg, const MachineInstr &MI)
LegalizeResult lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty)
Legalize an instruction by splitting it into simpler parts, hopefully understood by the target...
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:111
static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI)
This class implements the register bank concept.
Definition: RegisterBank.h:28
int64_t getImm() const
Helper struct that represents how a value is mapped through different register banks.
MachineInstrBuilder buildAShr(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, Optional< unsigned > Flags=None)
This file declares the MachineIRBuilder class.
A range adaptor for a pair of iterators.
static unsigned getReg(const void *D, unsigned RC, unsigned RegNo)
InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const override
Get the alternative mappings for MI.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:255
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:63
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Instruction has been legalized and the MachineFunction changed.
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC) const override
Get a register bank that covers RC.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB &#39;Other&#39; at the position From, and insert it into this MBB right before &#39;...
ConstantMatch m_ICst(int64_t &Cst)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
MachineInstrBuilder buildAdd(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, Optional< unsigned > Flags=None)
Build and insert Res = G_ADD Op0, Op1.
virtual InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const
Get the alternative mappings for MI.
virtual unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const
Get the cost of a copy from B to A, or put differently, get the cost of A = COPY B.
const MachineBasicBlock & getMBB() const
Getter for the basic block we currently build.
void setMBB(MachineBasicBlock &MBB)
Set the insertion point to the end of MBB.
#define I(x, y, z)
Definition: MD5.cpp:58
const InstructionMapping & getInstrMapping() const
The final mapping of the instruction.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const InstructionMapping & getInstrMappingImpl(const MachineInstr &MI) const
Try to get the mapping of MI.
uint32_t Size
Definition: Profile.cpp:46
MachineInstrBuilder buildBitcast(const DstOp &Dst, const SrcOp &Src)
Build and insert Dst = G_BITCAST Src.
void setSimpleHint(unsigned VReg, unsigned PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
MachineRegisterInfo & getMRI() const
The MachineRegisterInfo we used to realize the mapping.
unsigned getNumExplicitDefs() const
Returns the number of non-implicit definitions.
This class provides the information for the target register banks.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr & getMI() const
const TargetRegisterClass * getWaveMaskRegClass() const
Helper struct that represents how a value is mapped through different register banks.
static unsigned regBankUnion(unsigned RB0, unsigned RB1)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void insert(iterator MBBI, MachineBasicBlock *MBB)
unsigned getIntrinsicID() const
Returns the Intrinsic::ID for this instruction.
unsigned NumBreakDowns
Number of partial mapping to break down this value.
iterator_range< SmallVectorImpl< Register >::const_iterator > getVRegs(unsigned OpIdx, bool ForDebug=false) const
Get all the virtual registers required to map the OpIdx-th operand of the instruction.
uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
iterator_range< def_instr_iterator > def_instructions(unsigned Reg) const
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:273
operand_type_match m_Reg()
MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, Optional< unsigned > Flags=None)
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef< Register > Regs, LLT NewTy)
Replace the current type each register in Regs has with NewTy.
MachineInstrBuilder buildAnd(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1)
Build and insert Res = G_AND Op0, Op1.
MachineInstrBuilder buildMerge(const DstOp &Res, ArrayRef< Register > Ops)
Build and insert Res = G_MERGE_VALUES Op0, ...
IRTranslator LLVM IR MI
void setRegClass(unsigned Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
MachineInstrBuilder buildUndef(const DstOp &Res)
Build and insert Res = IMPLICIT_DEF.
static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
Simple wrapper observer that takes several observers, and calls each one for each event...
Register getReg() const
getReg - Returns the register number.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:415
const DebugLoc & getDL()
Getter for DebugLoc.
const ValueMapping & getValueMapping(unsigned StartIdx, unsigned Length, const RegisterBank &RegBank) const
The most common ValueMapping consists of a single PartialMapping.
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
static unsigned extractSLC(unsigned CachePolicy)
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:47
unsigned getPredicate() const
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164