SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions whose immediate offsets are close together.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset into the immediate field
23 // by adjusting the base. It tries to use a base from the nearby instructions
24 // that allows the access to use a 13-bit constant offset, which is then
25 // promoted into the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently relies on the scheduler to place loads and stores next to
46 // each other, and then only merges adjacent pairs of instructions. It would
47 // be good to be more flexible with interleaved instructions, and possibly run
48 // before scheduling. It currently misses stores of constants because loading
49 // the constant into the data register is placed between the stores, although
50 // this is arguably a scheduling problem.
51 //
52 // - Live interval recomputing seems inefficient. This currently only matches
53 // one pair, and recomputes live intervals and moves on to the next pair. It
54 // would be better to compute a list of all merges that need to occur.
55 //
56 // - With a list of instructions to process, we can also merge more. If a
57 // cluster of loads have offsets that are too large to fit in the 8-bit
58 // offset fields, but are close enough together that their differences fit in
59 // 8 bits, we can add to the base pointer and use the new, reduced offsets.
60 //
61 //===----------------------------------------------------------------------===//
62 
63 #include "AMDGPU.h"
64 #include "AMDGPUSubtarget.h"
65 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
66 #include "SIInstrInfo.h"
67 #include "SIRegisterInfo.h"
68 #include "Utils/AMDGPUBaseInfo.h"
69 #include "llvm/ADT/ArrayRef.h"
70 #include "llvm/ADT/SmallVector.h"
71 #include "llvm/ADT/StringRef.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/CodeGen/MachineBasicBlock.h"
74 #include "llvm/CodeGen/MachineFunction.h"
75 #include "llvm/CodeGen/MachineFunctionPass.h"
76 #include "llvm/CodeGen/MachineInstr.h"
77 #include "llvm/CodeGen/MachineInstrBuilder.h"
78 #include "llvm/CodeGen/MachineOperand.h"
79 #include "llvm/CodeGen/MachineRegisterInfo.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/MathExtras.h"
84 #include "llvm/Support/raw_ostream.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <cstdlib>
88 #include <iterator>
89 #include <utility>
90 
91 using namespace llvm;
92 
93 #define DEBUG_TYPE "si-load-store-opt"
94 
95 namespace {
96 enum InstClassEnum {
97  UNKNOWN,
98  DS_READ,
99  DS_WRITE,
100  S_BUFFER_LOAD_IMM,
101  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
102  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
103  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
104  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
105  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
106  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
107  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
108  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
109 };
110 
111 enum RegisterEnum {
112  SBASE = 0x1,
113  SRSRC = 0x2,
114  SOFFSET = 0x4,
115  VADDR = 0x8,
116  ADDR = 0x10,
117 };
118 
119 class SILoadStoreOptimizer : public MachineFunctionPass {
120  struct CombineInfo {
121  MachineBasicBlock::iterator I;
122  MachineBasicBlock::iterator Paired;
123  unsigned EltSize;
124  unsigned Offset0;
125  unsigned Offset1;
126  unsigned Width0;
127  unsigned Width1;
128  unsigned BaseOff;
129  InstClassEnum InstClass;
130  bool GLC0;
131  bool GLC1;
132  bool SLC0;
133  bool SLC1;
134  bool UseST64;
135  SmallVector<MachineInstr *, 8> InstsToMove;
136  };
137 
138  struct BaseRegisters {
139  unsigned LoReg = 0;
140  unsigned HiReg = 0;
141 
142  unsigned LoSubReg = 0;
143  unsigned HiSubReg = 0;
144  };
145 
146  struct MemAddress {
147  BaseRegisters Base;
148  int64_t Offset = 0;
149  };
150 
151  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
152 
153 private:
154  const GCNSubtarget *STM = nullptr;
155  const SIInstrInfo *TII = nullptr;
156  const SIRegisterInfo *TRI = nullptr;
157  MachineRegisterInfo *MRI = nullptr;
158  AliasAnalysis *AA = nullptr;
159  bool OptimizeAgain;
160 
161  static bool offsetsCanBeCombined(CombineInfo &CI);
162  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
163  static unsigned getNewOpcode(const CombineInfo &CI);
164  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
165  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
166  unsigned getOpcodeWidth(const MachineInstr &MI);
167  InstClassEnum getInstClass(unsigned Opc);
168  unsigned getRegs(unsigned Opc);
169 
170  bool findMatchingInst(CombineInfo &CI);
171 
172  unsigned read2Opcode(unsigned EltSize) const;
173  unsigned read2ST64Opcode(unsigned EltSize) const;
174  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
175 
176  unsigned write2Opcode(unsigned EltSize) const;
177  unsigned write2ST64Opcode(unsigned EltSize) const;
178  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
179  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
180  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
181  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
182 
183  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
184  int32_t NewOffset);
185  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
186  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
187  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
188  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
189  /// Promotes constant offset to the immediate by adjusting the base. It
190  /// tries to use a base from the nearby instructions that allows it to have
191  /// a 13-bit constant offset which gets promoted to the immediate.
192  bool promoteConstantOffsetToImm(MachineInstr &CI,
193  MemInfoMap &Visited,
194  SmallPtrSet<MachineInstr *, 4> &AnchorList);
195 
196 public:
197  static char ID;
198 
199  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
200  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
201  }
202 
203  bool optimizeBlock(MachineBasicBlock &MBB);
204 
205  bool runOnMachineFunction(MachineFunction &MF) override;
206 
207  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
208 
209  void getAnalysisUsage(AnalysisUsage &AU) const override {
210  AU.setPreservesCFG();
211  AU.addRequired<AAResultsWrapperPass>();
212 
213  MachineFunctionPass::getAnalysisUsage(AU);
214  }
215 };
216 
217 } // end anonymous namespace.
218 
219 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
220  "SI Load Store Optimizer", false, false)
221 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
222 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
223  false, false)
224 
225 char SILoadStoreOptimizer::ID = 0;
226 
227 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
228 
229 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
230  return new SILoadStoreOptimizer();
231 }
232 
233 static void moveInstsAfter(MachineBasicBlock::iterator I,
234  ArrayRef<MachineInstr *> InstsToMove) {
235  MachineBasicBlock *MBB = I->getParent();
236  ++I;
237  for (MachineInstr *MI : InstsToMove) {
238  MI->removeFromParent();
239  MBB->insert(I, MI);
240  }
241 }
242 
243 static void addDefsUsesToList(const MachineInstr &MI,
244  DenseSet<unsigned> &RegDefs,
245  DenseSet<unsigned> &PhysRegUses) {
246  for (const MachineOperand &Op : MI.operands()) {
247  if (Op.isReg()) {
248  if (Op.isDef())
249  RegDefs.insert(Op.getReg());
250  else if (Op.readsReg() &&
251  TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
252  PhysRegUses.insert(Op.getReg());
253  }
254  }
255 }
256 
257 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
258  MachineBasicBlock::iterator B,
259  AliasAnalysis *AA) {
260  // RAW or WAR - cannot reorder
261  // WAW - cannot reorder
262  // RAR - safe to reorder
263  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
264 }
265 
266 // Add MI and its defs to the lists if MI reads one of the defs that are
267 // already in the list. Returns true in that case.
268 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
269  DenseSet<unsigned> &PhysRegUses,
270  SmallVectorImpl<MachineInstr *> &Insts) {
271  for (MachineOperand &Use : MI.operands()) {
272  // If one of the defs is read, then there is a use of Def between I and the
273  // instruction that I will potentially be merged with. We will need to move
274  // this instruction after the merged instructions.
275  //
276  // Similarly, if there is a def which is read by an instruction that is to
277  // be moved for merging, then we need to move the def-instruction as well.
278  // This can only happen for physical registers such as M0; virtual
279  // registers are in SSA form.
280  if (Use.isReg() &&
281  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
282  (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
283  PhysRegUses.count(Use.getReg())))) {
284  Insts.push_back(&MI);
285  addDefsUsesToList(MI, RegDefs, PhysRegUses);
286  return true;
287  }
288  }
289 
290  return false;
291 }
292 
293 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
294  ArrayRef<MachineInstr *> InstsToMove,
295  AliasAnalysis *AA) {
296  assert(MemOp.mayLoadOrStore());
297 
298  for (MachineInstr *InstToMove : InstsToMove) {
299  if (!InstToMove->mayLoadOrStore())
300  continue;
301  if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
302  return false;
303  }
304  return true;
305 }
306 
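// Decide whether the two offsets recorded in CI can be encoded by a single
// merged instruction. For DS accesses this may rewrite CI.Offset0/Offset1 in
// element (or 64-element) units, set CI.UseST64, or pick a CI.BaseOff that the
// merge routines add back to the base address.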
307 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
308  // XXX - Would the same offset be OK? Is there any reason this would happen or
309  // be useful?
310  if (CI.Offset0 == CI.Offset1)
311  return false;
312 
313  // This won't be valid if the offset isn't aligned.
314  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
315  return false;
316 
317  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
318  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
319  CI.UseST64 = false;
320  CI.BaseOff = 0;
321 
322  // Handle SMEM and VMEM instructions.
323  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
324  return (EltOffset0 + CI.Width0 == EltOffset1 ||
325  EltOffset1 + CI.Width1 == EltOffset0) &&
326  CI.GLC0 == CI.GLC1 &&
327  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
328  }
329 
330  // If the offset in elements doesn't fit in 8-bits, we might be able to use
331  // the stride 64 versions.
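  // For example, two ds_read_b32 at byte offsets 0x3000 and 0x3100 have
  // element offsets 3072 and 3136, which do not fit in 8 bits, but both are
  // multiples of 64, so the ST64 form can encode them as 48 and 49.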
332  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
333  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
334  CI.Offset0 = EltOffset0 / 64;
335  CI.Offset1 = EltOffset1 / 64;
336  CI.UseST64 = true;
337  return true;
338  }
339 
340  // Check if the new offsets fit in the reduced 8-bit range.
341  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
342  CI.Offset0 = EltOffset0;
343  CI.Offset1 = EltOffset1;
344  return true;
345  }
346 
347  // Try to shift base address to decrease offsets.
348  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
349  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
350 
351  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
352  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
353  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
354  CI.UseST64 = true;
355  return true;
356  }
357 
358  if (isUInt<8>(OffsetDiff)) {
359  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
360  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
361  return true;
362  }
363 
364  return false;
365 }
366 
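// Return true if a single merged instruction can cover the combined width
// (in dwords) of the two accesses on this subtarget.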
367 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
368  const CombineInfo &CI) {
369  const unsigned Width = (CI.Width0 + CI.Width1);
370  switch (CI.InstClass) {
371  default:
372  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
373  case S_BUFFER_LOAD_IMM:
374  switch (Width) {
375  default:
376  return false;
377  case 2:
378  case 4:
379  return true;
380  }
381  }
382 }
383 
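// Return the width of MI's access in dwords, or 0 if the opcode is not one
// this pass knows how to merge.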
384 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
385  const unsigned Opc = MI.getOpcode();
386 
387  if (TII->isMUBUF(MI)) {
388  return AMDGPU::getMUBUFDwords(Opc);
389  }
390 
391  switch (Opc) {
392  default:
393  return 0;
394  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
395  return 1;
396  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
397  return 2;
398  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
399  return 4;
400  }
401 }
402 
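// Map an opcode to the instruction class used for merging, or UNKNOWN if the
// opcode is not handled.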
403 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
404  if (TII->isMUBUF(Opc)) {
405  const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
406 
407  // If we couldn't identify the opcode, bail out.
408  if (baseOpcode == -1) {
409  return UNKNOWN;
410  }
411 
412  switch (baseOpcode) {
413  default:
414  return UNKNOWN;
415  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
416  return BUFFER_LOAD_OFFEN;
417  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
418  return BUFFER_LOAD_OFFSET;
419  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
420  return BUFFER_STORE_OFFEN;
421  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
422  return BUFFER_STORE_OFFSET;
423  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
424  return BUFFER_LOAD_OFFEN_exact;
425  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
426  return BUFFER_LOAD_OFFSET_exact;
427  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
428  return BUFFER_STORE_OFFEN_exact;
429  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
430  return BUFFER_STORE_OFFSET_exact;
431  }
432  }
433 
434  switch (Opc) {
435  default:
436  return UNKNOWN;
437  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
438  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
439  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
440  return S_BUFFER_LOAD_IMM;
441  case AMDGPU::DS_READ_B32:
442  case AMDGPU::DS_READ_B64:
443  case AMDGPU::DS_READ_B32_gfx9:
444  case AMDGPU::DS_READ_B64_gfx9:
445  return DS_READ;
446  case AMDGPU::DS_WRITE_B32:
447  case AMDGPU::DS_WRITE_B64:
448  case AMDGPU::DS_WRITE_B32_gfx9:
449  case AMDGPU::DS_WRITE_B64_gfx9:
450  return DS_WRITE;
451  }
452 }
453 
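// Return a RegisterEnum bitmask describing which address operands (addr,
// sbase, srsrc, soffset, vaddr) the opcode uses.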
454 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
455  if (TII->isMUBUF(Opc)) {
456  unsigned result = 0;
457 
458  if (AMDGPU::getMUBUFHasVAddr(Opc)) {
459  result |= VADDR;
460  }
461 
462  if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
463  result |= SRSRC;
464  }
465 
466  if (AMDGPU::getMUBUFHasSoffset(Opc)) {
467  result |= SOFFSET;
468  }
469 
470  return result;
471  }
472 
473  switch (Opc) {
474  default:
475  return 0;
476  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
477  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
478  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
479  return SBASE;
480  case AMDGPU::DS_READ_B32:
481  case AMDGPU::DS_READ_B64:
482  case AMDGPU::DS_READ_B32_gfx9:
483  case AMDGPU::DS_READ_B64_gfx9:
484  case AMDGPU::DS_WRITE_B32:
485  case AMDGPU::DS_WRITE_B64:
486  case AMDGPU::DS_WRITE_B32_gfx9:
487  case AMDGPU::DS_WRITE_B64_gfx9:
488  return ADDR;
489  }
490 }
491 
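// Scan forward from CI.I looking for a second instruction of the same class
// with the same base address that can be merged with it. On success, fill in
// CI.Paired, the offsets, widths and cache policy bits, and the list of
// intervening instructions that must be moved below the merged instruction.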
492 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
493  MachineBasicBlock *MBB = CI.I->getParent();
494  MachineBasicBlock::iterator E = MBB->end();
495  MachineBasicBlock::iterator MBBI = CI.I;
496 
497  const unsigned Opc = CI.I->getOpcode();
498  const InstClassEnum InstClass = getInstClass(Opc);
499 
500  if (InstClass == UNKNOWN) {
501  return false;
502  }
503 
504  const unsigned Regs = getRegs(Opc);
505 
506  unsigned AddrOpName[5] = {0};
507  int AddrIdx[5];
508  const MachineOperand *AddrReg[5];
509  unsigned NumAddresses = 0;
510 
511  if (Regs & ADDR) {
512  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
513  }
514 
515  if (Regs & SBASE) {
516  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
517  }
518 
519  if (Regs & SRSRC) {
520  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
521  }
522 
523  if (Regs & SOFFSET) {
524  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
525  }
526 
527  if (Regs & VADDR) {
528  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
529  }
530 
531  for (unsigned i = 0; i < NumAddresses; i++) {
532  AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
533  AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
534 
535  // We only ever merge operations with the same base address register, so
536  // don't bother scanning forward if there are no other uses.
537  if (AddrReg[i]->isReg() &&
538  (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
539  MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
540  return false;
541  }
542 
543  ++MBBI;
544 
545  DenseSet<unsigned> RegDefsToMove;
546  DenseSet<unsigned> PhysRegUsesToMove;
547  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
548 
549  for (; MBBI != E; ++MBBI) {
550  const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
551 
552  if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
553  (IsDS && (MBBI->getOpcode() != Opc))) {
554  // This is not a matching DS instruction, but we can keep looking as
555  // long as one of these conditions is met:
556  // 1. It is safe to move I down past MBBI.
557  // 2. It is safe to move MBBI down past the instruction that I will
558  // be merged into.
559 
560  if (MBBI->hasUnmodeledSideEffects()) {
561  // We can't re-order this instruction with respect to other memory
562  // operations, so we fail both conditions mentioned above.
563  return false;
564  }
565 
566  if (MBBI->mayLoadOrStore() &&
567  (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
568  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
569  // We fail condition #1, but we may still be able to satisfy condition
570  // #2. Add this instruction to the move list and then we will check
571  // if condition #2 holds once we have selected the matching instruction.
572  CI.InstsToMove.push_back(&*MBBI);
573  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
574  continue;
575  }
576 
577  // When we match I with another DS instruction we will be moving I down
578  // to the location of the matched instruction, so any uses of I will need
579  // to be moved down as well.
580  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
581  CI.InstsToMove);
582  continue;
583  }
584 
585  // Don't merge volatiles.
586  if (MBBI->hasOrderedMemoryRef())
587  return false;
588 
589  // Handle a case like
590  // DS_WRITE_B32 addr, v, idx0
591  // w = DS_READ_B32 addr, idx0
592  // DS_WRITE_B32 addr, f(w), idx1
593  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
594  // merging of the two writes.
595  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
596  CI.InstsToMove))
597  continue;
598 
599  bool Match = true;
600  for (unsigned i = 0; i < NumAddresses; i++) {
601  const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
602 
603  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
604  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
605  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
606  Match = false;
607  break;
608  }
609  continue;
610  }
611 
612  // Check same base pointer. Be careful of subregisters, which can occur
613  // with vectors of pointers.
614  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
615  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
616  Match = false;
617  break;
618  }
619  }
620 
621  if (Match) {
622  int OffsetIdx =
623  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
624  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
625  CI.Width0 = getOpcodeWidth(*CI.I);
626  CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
627  CI.Width1 = getOpcodeWidth(*MBBI);
628  CI.Paired = MBBI;
629 
630  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
631  CI.Offset0 &= 0xffff;
632  CI.Offset1 &= 0xffff;
633  } else {
634  CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
635  CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
636  if (CI.InstClass != S_BUFFER_LOAD_IMM) {
637  CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
638  CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
639  }
640  }
641 
642  // Check both offsets fit in the reduced range.
643  // We also need to go through the list of instructions that we plan to
644  // move and make sure they are all safe to move down past the merged
645  // instruction.
646  if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
647  if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
648  return true;
649  }
650 
651  // We've found a load/store that we couldn't merge for some reason.
652  // We could potentially keep looking, but we'd need to make sure that
653  // it was safe to move I and also all the instructions in InstsToMove
654  // down past this instruction.
655  // Check if we can move I across MBBI and if we can move all of I's users.
656  if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
657  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
658  break;
659  }
660  return false;
661 }
662 
663 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
664  if (STM->ldsRequiresM0Init())
665  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
666  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
667 }
668 
669 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
670  if (STM->ldsRequiresM0Init())
671  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
672 
673  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
674  : AMDGPU::DS_READ2ST64_B64_gfx9;
675 }
676 
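// Replace the two DS reads in CI with a single ds_read2 (or ds_read2st64)
// and copy the sub-registers of the merged result back into the original
// destination registers.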
677 MachineBasicBlock::iterator
678 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
679  MachineBasicBlock *MBB = CI.I->getParent();
680 
681  // Be careful, since the addresses could be subregisters themselves in weird
682  // cases, like vectors of pointers.
683  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
684 
685  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
686  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
687 
688  unsigned NewOffset0 = CI.Offset0;
689  unsigned NewOffset1 = CI.Offset1;
690  unsigned Opc =
691  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
692 
693  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
694  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
695 
696  if (NewOffset0 > NewOffset1) {
697  // Canonicalize the merged instruction so the smaller offset comes first.
698  std::swap(NewOffset0, NewOffset1);
699  std::swap(SubRegIdx0, SubRegIdx1);
700  }
701 
702  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
703  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
704 
705  const MCInstrDesc &Read2Desc = TII->get(Opc);
706 
707  const TargetRegisterClass *SuperRC =
708  (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
709  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
710 
711  DebugLoc DL = CI.I->getDebugLoc();
712 
713  unsigned BaseReg = AddrReg->getReg();
714  unsigned BaseSubReg = AddrReg->getSubReg();
715  unsigned BaseRegFlags = 0;
716  if (CI.BaseOff) {
717  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
718  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
719  .addImm(CI.BaseOff);
720 
721  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
722  BaseRegFlags = RegState::Kill;
723 
724  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
725  .addReg(ImmReg)
726  .addReg(AddrReg->getReg(), 0, BaseSubReg);
727  BaseSubReg = 0;
728  }
729 
730  MachineInstrBuilder Read2 =
731  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
732  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
733  .addImm(NewOffset0) // offset0
734  .addImm(NewOffset1) // offset1
735  .addImm(0) // gds
736  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
737 
738  (void)Read2;
739 
740  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
741 
742  // Copy to the old destination registers.
743  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
744  .add(*Dest0) // Copy to same destination including flags and sub reg.
745  .addReg(DestReg, 0, SubRegIdx0);
746  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
747  .add(*Dest1)
748  .addReg(DestReg, RegState::Kill, SubRegIdx1);
749 
750  moveInstsAfter(Copy1, CI.InstsToMove);
751 
752  MachineBasicBlock::iterator Next = std::next(CI.I);
753  CI.I->eraseFromParent();
754  CI.Paired->eraseFromParent();
755 
756  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
757  return Next;
758 }
759 
760 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
761  if (STM->ldsRequiresM0Init())
762  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
763  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
764  : AMDGPU::DS_WRITE2_B64_gfx9;
765 }
766 
767 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
768  if (STM->ldsRequiresM0Init())
769  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
770  : AMDGPU::DS_WRITE2ST64_B64;
771 
772  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
773  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
774 }
775 
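// Replace the two DS writes in CI with a single ds_write2 (or ds_write2st64).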
776 MachineBasicBlock::iterator
777 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
778  MachineBasicBlock *MBB = CI.I->getParent();
779 
780  // Be sure to use .add() and not .addReg() with these. We want to be
781  // sure we preserve the subregister index and any register flags set on them.
782  const MachineOperand *AddrReg =
783  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
784  const MachineOperand *Data0 =
785  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
786  const MachineOperand *Data1 =
787  TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
788 
789  unsigned NewOffset0 = CI.Offset0;
790  unsigned NewOffset1 = CI.Offset1;
791  unsigned Opc =
792  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
793 
794  if (NewOffset0 > NewOffset1) {
795  // Canonicalize the merged instruction so the smaller offset comes first.
796  std::swap(NewOffset0, NewOffset1);
797  std::swap(Data0, Data1);
798  }
799 
800  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
801  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
802 
803  const MCInstrDesc &Write2Desc = TII->get(Opc);
804  DebugLoc DL = CI.I->getDebugLoc();
805 
806  unsigned BaseReg = AddrReg->getReg();
807  unsigned BaseSubReg = AddrReg->getSubReg();
808  unsigned BaseRegFlags = 0;
809  if (CI.BaseOff) {
810  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
811  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
812  .addImm(CI.BaseOff);
813 
814  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
815  BaseRegFlags = RegState::Kill;
816 
817  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
818  .addReg(ImmReg)
819  .addReg(AddrReg->getReg(), 0, BaseSubReg);
820  BaseSubReg = 0;
821  }
822 
823  MachineInstrBuilder Write2 =
824  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
825  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
826  .add(*Data0) // data0
827  .add(*Data1) // data1
828  .addImm(NewOffset0) // offset0
829  .addImm(NewOffset1) // offset1
830  .addImm(0) // gds
831  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
832 
833  moveInstsAfter(Write2, CI.InstsToMove);
834 
835  MachineBasicBlock::iterator Next = std::next(CI.I);
836  CI.I->eraseFromParent();
837  CI.Paired->eraseFromParent();
838 
839  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
840  return Next;
841 }
842 
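// Replace the two s_buffer_load_dword* in CI with one wider load and copy the
// sub-registers of the merged result back into the original destinations.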
843 MachineBasicBlock::iterator
844 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
845  MachineBasicBlock *MBB = CI.I->getParent();
846  DebugLoc DL = CI.I->getDebugLoc();
847  const unsigned Opcode = getNewOpcode(CI);
848 
849  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
850 
851  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
852  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
853 
854  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
855  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
856  .addImm(MergedOffset) // offset
857  .addImm(CI.GLC0) // glc
858  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
859 
860  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
861  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
862  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
863 
864  // Copy to the old destination registers.
865  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
866  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
867  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
868 
869  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
870  .add(*Dest0) // Copy to same destination including flags and sub reg.
871  .addReg(DestReg, 0, SubRegIdx0);
872  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
873  .add(*Dest1)
874  .addReg(DestReg, RegState::Kill, SubRegIdx1);
875 
876  moveInstsAfter(Copy1, CI.InstsToMove);
877 
878  MachineBasicBlock::iterator Next = std::next(CI.I);
879  CI.I->eraseFromParent();
880  CI.Paired->eraseFromParent();
881  return Next;
882 }
883 
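// Replace the two MUBUF loads in CI with one wider buffer load and copy the
// sub-registers of the merged result back into the original destinations.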
884 MachineBasicBlock::iterator
885 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
886  MachineBasicBlock *MBB = CI.I->getParent();
887  DebugLoc DL = CI.I->getDebugLoc();
888 
889  const unsigned Opcode = getNewOpcode(CI);
890 
891  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
892 
893  // Copy to the new source register.
894  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
895  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
896 
897  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
898 
899  const unsigned Regs = getRegs(Opcode);
900 
901  if (Regs & VADDR)
902  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
903 
904  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
905  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
906  .addImm(MergedOffset) // offset
907  .addImm(CI.GLC0) // glc
908  .addImm(CI.SLC0) // slc
909  .addImm(0) // tfe
910  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
911 
912  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
913  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
914  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
915 
916  // Copy to the old destination registers.
917  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
918  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
919  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
920 
921  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
922  .add(*Dest0) // Copy to same destination including flags and sub reg.
923  .addReg(DestReg, 0, SubRegIdx0);
924  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
925  .add(*Dest1)
926  .addReg(DestReg, RegState::Kill, SubRegIdx1);
927 
928  moveInstsAfter(Copy1, CI.InstsToMove);
929 
930  MachineBasicBlock::iterator Next = std::next(CI.I);
931  CI.I->eraseFromParent();
932  CI.Paired->eraseFromParent();
933  return Next;
934 }
935 
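// Return the opcode of the merged instruction for the combined width of the
// pair in CI.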
936 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
937  const unsigned Width = CI.Width0 + CI.Width1;
938 
939  switch (CI.InstClass) {
940  default:
941  return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
942  case UNKNOWN:
943  llvm_unreachable("Unknown instruction class");
944  case S_BUFFER_LOAD_IMM:
945  switch (Width) {
946  default:
947  return 0;
948  case 2:
949  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
950  case 4:
951  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
952  }
953  }
954 }
955 
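// Return the sub-register indices within the merged register that correspond
// to CI.I and CI.Paired respectively, taking into account which of the two
// offsets is lower.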
956 std::pair<unsigned, unsigned>
957 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
958  if (CI.Offset0 > CI.Offset1) {
959  switch (CI.Width0) {
960  default:
961  return std::make_pair(0, 0);
962  case 1:
963  switch (CI.Width1) {
964  default:
965  return std::make_pair(0, 0);
966  case 1:
967  return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
968  case 2:
969  return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
970  case 3:
971  return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
972  }
973  case 2:
974  switch (CI.Width1) {
975  default:
976  return std::make_pair(0, 0);
977  case 1:
978  return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
979  case 2:
980  return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
981  }
982  case 3:
983  switch (CI.Width1) {
984  default:
985  return std::make_pair(0, 0);
986  case 1:
987  return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
988  }
989  }
990  } else {
991  switch (CI.Width0) {
992  default:
993  return std::make_pair(0, 0);
994  case 1:
995  switch (CI.Width1) {
996  default:
997  return std::make_pair(0, 0);
998  case 1:
999  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1000  case 2:
1001  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1002  case 3:
1003  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1004  }
1005  case 2:
1006  switch (CI.Width1) {
1007  default:
1008  return std::make_pair(0, 0);
1009  case 1:
1010  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1011  case 2:
1012  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1013  }
1014  case 3:
1015  switch (CI.Width1) {
1016  default:
1017  return std::make_pair(0, 0);
1018  case 1:
1019  return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1020  }
1021  }
1022  }
1023 }
1024 
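// Return a register class wide enough to hold the merged result of the pair
// in CI (SGPR classes for S_BUFFER loads, VGPR classes otherwise).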
1025 const TargetRegisterClass *
1026 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1027  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1028  switch (CI.Width0 + CI.Width1) {
1029  default:
1030  return nullptr;
1031  case 2:
1032  return &AMDGPU::SReg_64_XEXECRegClass;
1033  case 4:
1034  return &AMDGPU::SReg_128RegClass;
1035  case 8:
1036  return &AMDGPU::SReg_256RegClass;
1037  case 16:
1038  return &AMDGPU::SReg_512RegClass;
1039  }
1040  } else {
1041  switch (CI.Width0 + CI.Width1) {
1042  default:
1043  return nullptr;
1044  case 2:
1045  return &AMDGPU::VReg_64RegClass;
1046  case 3:
1047  return &AMDGPU::VReg_96RegClass;
1048  case 4:
1049  return &AMDGPU::VReg_128RegClass;
1050  }
1051  }
1052 }
1053 
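// Build a REG_SEQUENCE from the two store sources and replace the two MUBUF
// stores in CI with one wider buffer store.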
1054 MachineBasicBlock::iterator
1055 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1056  MachineBasicBlock *MBB = CI.I->getParent();
1057  DebugLoc DL = CI.I->getDebugLoc();
1058 
1059  const unsigned Opcode = getNewOpcode(CI);
1060 
1061  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1062  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1063  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1064 
1065  // Copy to the new source register.
1066  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1067  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1068 
1069  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1070  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1071 
1072  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1073  .add(*Src0)
1074  .addImm(SubRegIdx0)
1075  .add(*Src1)
1076  .addImm(SubRegIdx1);
1077 
1078  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1079  .addReg(SrcReg, RegState::Kill);
1080 
1081  const unsigned Regs = getRegs(Opcode);
1082 
1083  if (Regs & VADDR)
1084  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1085 
1086  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1087  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1088  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1089  .addImm(CI.GLC0) // glc
1090  .addImm(CI.SLC0) // slc
1091  .addImm(0) // tfe
1092  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1093 
1094  moveInstsAfter(MIB, CI.InstsToMove);
1095 
1096  MachineBasicBlock::iterator Next = std::next(CI.I);
1097  CI.I->eraseFromParent();
1098  CI.Paired->eraseFromParent();
1099  return Next;
1100 }
1101 
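// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into an SGPR with S_MOV_B32 and return that register.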
1102 MachineOperand
1103 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1104  APInt V(32, Val, true);
1105  if (TII->isInlineConstant(V))
1106  return MachineOperand::CreateImm(Val);
1107 
1108  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1109  MachineInstr *Mov =
1110  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1111  TII->get(AMDGPU::S_MOV_B32), Reg)
1112  .addImm(Val);
1113  (void)Mov;
1114  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1115  return MachineOperand::CreateReg(Reg, false);
1116 }
1117 
1118 // Compute base address using Addr and return the final register.
1119 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1120  const MemAddress &Addr) {
1121  MachineBasicBlock *MBB = MI.getParent();
1122  MachineBasicBlock::iterator MBBI = MI.getIterator();
1123  DebugLoc DL = MI.getDebugLoc();
1124 
1125  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1126  Addr.Base.LoSubReg) &&
1127  "Expected 32-bit Base-Register-Low!!");
1128 
1129  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1130  Addr.Base.HiSubReg) &&
1131  "Expected 32-bit Base-Register-Hi!!");
1132 
1133  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1134  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1135  MachineOperand OffsetHi =
1136  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1137  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1138  unsigned DeadCarryReg =
1139  MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1140 
1141  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1142  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1143  MachineInstr *LoHalf =
1144  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1145  .addReg(CarryReg, RegState::Define)
1146  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1147  .add(OffsetLo);
1148  (void)LoHalf;
1149  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1150 
1151  MachineInstr *HiHalf =
1152  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1153  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1154  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1155  .add(OffsetHi)
1156  .addReg(CarryReg, RegState::Kill);
1157  (void)HiHalf;
1158  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1159 
1160  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1161  MachineInstr *FullBase =
1162  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1163  .addReg(DestSub0)
1164  .addImm(AMDGPU::sub0)
1165  .addReg(DestSub1)
1166  .addImm(AMDGPU::sub1);
1167  (void)FullBase;
1168  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1169 
1170  return FullDestReg;
1171 }
1172 
1173 // Update base and offset with the NewBase and NewOffset in MI.
1174 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1175  unsigned NewBase,
1176  int32_t NewOffset) {
1177  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1178  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1179 }
1180 
1181 Optional<int32_t>
1182 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1183  if (Op.isImm())
1184  return Op.getImm();
1185 
1186  if (!Op.isReg())
1187  return None;
1188 
1189  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1190  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1191  !Def->getOperand(1).isImm())
1192  return None;
1193 
1194  return Def->getOperand(1).getImm();
1195 }
1196 
1197 // Analyze Base and extract:
1198 // - 32-bit base registers and subregisters
1199 // - 64-bit constant offset
1200 // Expecting base computation as:
1201 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1202 // %LO:vgpr_32, %c:sreg_64_xexec =
1203 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
1204 // %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1205 // %Base:vreg_64 =
1206 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1207 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1208  MemAddress &Addr) {
1209  if (!Base.isReg())
1210  return;
1211 
1212  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1213  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1214  || Def->getNumOperands() != 5)
1215  return;
1216 
1217  MachineOperand BaseLo = Def->getOperand(1);
1218  MachineOperand BaseHi = Def->getOperand(3);
1219  if (!BaseLo.isReg() || !BaseHi.isReg())
1220  return;
1221 
1222  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1223  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1224 
1225  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1226  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1227  return;
1228 
1229  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1230  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1231 
1232  auto Offset0P = extractConstOffset(*Src0);
1233  if (Offset0P)
1234  BaseLo = *Src1;
1235  else {
1236  if (!(Offset0P = extractConstOffset(*Src1)))
1237  return;
1238  BaseLo = *Src0;
1239  }
1240 
1241  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1242  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1243 
1244  if (Src0->isImm())
1245  std::swap(Src0, Src1);
1246 
1247  if (!Src1->isImm())
1248  return;
1249 
1250  uint64_t Offset1 = Src1->getImm();
1251  BaseHi = *Src0;
1252 
1253  Addr.Base.LoReg = BaseLo.getReg();
1254  Addr.Base.HiReg = BaseHi.getReg();
1255  Addr.Base.LoSubReg = BaseLo.getSubReg();
1256  Addr.Base.HiSubReg = BaseHi.getSubReg();
1257  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1258 }
1259 
1260 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1261  MachineInstr &MI,
1262  MemInfoMap &Visited,
1263  SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1264 
1265  // TODO: Support flat and scratch.
1266  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1267  TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1268  return false;
1269 
1270  // TODO: Support Store.
1271  if (!MI.mayLoad())
1272  return false;
1273 
1274  if (AnchorList.count(&MI))
1275  return false;
1276 
1277  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1278 
1279  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1280  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1281  return false;
1282  }
1283 
1284  // Step1: Find the base-registers and a 64bit constant offset.
1285  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1286  MemAddress MAddr;
1287  if (Visited.find(&MI) == Visited.end()) {
1288  processBaseWithConstOffset(Base, MAddr);
1289  Visited[&MI] = MAddr;
1290  } else
1291  MAddr = Visited[&MI];
1292 
1293  if (MAddr.Offset == 0) {
1294  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1295  " constant offsets that can be promoted.\n";);
1296  return false;
1297  }
1298 
1299  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1300  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1301 
1302  // Step2: Traverse through MI's basic block and find an anchor (with the
1303  // same base registers) that has the largest 13-bit distance from MI's offset.
1304  // E.g. (64bit loads)
1305  // bb:
1306  // addr1 = &a + 4096; load1 = load(addr1, 0)
1307  // addr2 = &a + 6144; load2 = load(addr2, 0)
1308  // addr3 = &a + 8192; load3 = load(addr3, 0)
1309  // addr4 = &a + 10240; load4 = load(addr4, 0)
1310  // addr5 = &a + 12288; load5 = load(addr5, 0)
1311  //
1312  // Starting from the first load, the optimization will try to find a new base
1313  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1314  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
1315  // the new base (anchor) because the maximum distance can presumably
1316  // accommodate more intermediate bases.
1317  //
1318  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1319  // (&a + 8192) for load1, load2, load4.
1320  // addr = &a + 8192
1321  // load1 = load(addr, -4096)
1322  // load2 = load(addr, -2048)
1323  // load3 = load(addr, 0)
1324  // load4 = load(addr, 2048)
1325  // addr5 = &a + 12288; load5 = load(addr5, 0)
1326  //
1327  MachineInstr *AnchorInst = nullptr;
1328  MemAddress AnchorAddr;
1329  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1330  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1331 
1332  MachineBasicBlock *MBB = MI.getParent();
1333  MachineBasicBlock::iterator E = MBB->end();
1334  MachineBasicBlock::iterator MBBI = MI.getIterator();
1335  ++MBBI;
1336  const SITargetLowering *TLI =
1337  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1338 
1339  for ( ; MBBI != E; ++MBBI) {
1340  MachineInstr &MINext = *MBBI;
1341  // TODO: Support finding an anchor(with same base) from store addresses or
1342  // any other load addresses where the opcodes are different.
1343  if (MINext.getOpcode() != MI.getOpcode() ||
1344  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1345  continue;
1346 
1347  const MachineOperand &BaseNext =
1348  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1349  MemAddress MAddrNext;
1350  if (Visited.find(&MINext) == Visited.end()) {
1351  processBaseWithConstOffset(BaseNext, MAddrNext);
1352  Visited[&MINext] = MAddrNext;
1353  } else
1354  MAddrNext = Visited[&MINext];
1355 
1356  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1357  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1358  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1359  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1360  continue;
1361 
1362  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1363 
1364  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1365  TargetLoweringBase::AddrMode AM;
1366  AM.HasBaseReg = true;
1367  AM.BaseOffs = Dist;
1368  if (TLI->isLegalGlobalAddressingMode(AM) &&
1369  (uint32_t)std::abs(Dist) > MaxDist) {
1370  MaxDist = std::abs(Dist);
1371 
1372  AnchorAddr = MAddrNext;
1373  AnchorInst = &MINext;
1374  }
1375  }
1376 
1377  if (AnchorInst) {
1378  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1379  AnchorInst->dump());
1380  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1381  << AnchorAddr.Offset << "\n\n");
1382 
1383  // Instead of moving up, just re-compute anchor-instruction's base address.
1384  unsigned Base = computeBase(MI, AnchorAddr);
1385 
1386  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1387  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1388 
1389  for (auto P : InstsWCommonBase) {
1390  TargetLoweringBase::AddrMode AM;
1391  AM.HasBaseReg = true;
1392  AM.BaseOffs = P.second - AnchorAddr.Offset;
1393 
1394  if (TLI->isLegalGlobalAddressingMode(AM)) {
1395  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1396  dbgs() << ")"; P.first->dump());
1397  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1398  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1399  }
1400  }
1401  AnchorList.insert(AnchorInst);
1402  return true;
1403  }
1404 
1405  return false;
1406 }
1407 
1408 // Scan through looking for adjacent LDS operations with constant offsets from
1409 // the same base register. We rely on the scheduler to do the hard work of
1410 // clustering nearby loads, and assume these are all adjacent.
1411 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1412  bool Modified = false;
1413 
1414  // Maps each visited instruction to its base registers and constant offset.
1415  MemInfoMap Visited;
1416  // Contains the list of instructions for which constant offsets are being
1417  // promoted to the IMM.
1418  SmallPtrSet<MachineInstr *, 4> AnchorList;
1419 
1420  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1421  MachineInstr &MI = *I;
1422 
1423  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1424  Modified = true;
1425 
1426  // Don't combine if volatile.
1427  if (MI.hasOrderedMemoryRef()) {
1428  ++I;
1429  continue;
1430  }
1431 
1432  const unsigned Opc = MI.getOpcode();
1433 
1434  CombineInfo CI;
1435  CI.I = I;
1436  CI.InstClass = getInstClass(Opc);
1437 
1438  switch (CI.InstClass) {
1439  default:
1440  break;
1441  case DS_READ:
1442  CI.EltSize =
1443  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1444  : 4;
1445  if (findMatchingInst(CI)) {
1446  Modified = true;
1447  I = mergeRead2Pair(CI);
1448  } else {
1449  ++I;
1450  }
1451  continue;
1452  case DS_WRITE:
1453  CI.EltSize =
1454  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1455  : 4;
1456  if (findMatchingInst(CI)) {
1457  Modified = true;
1458  I = mergeWrite2Pair(CI);
1459  } else {
1460  ++I;
1461  }
1462  continue;
1463  case S_BUFFER_LOAD_IMM:
1464  CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1465  if (findMatchingInst(CI)) {
1466  Modified = true;
1467  I = mergeSBufferLoadImmPair(CI);
1468  OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1469  } else {
1470  ++I;
1471  }
1472  continue;
1473  case BUFFER_LOAD_OFFEN:
1474  case BUFFER_LOAD_OFFSET:
1475  case BUFFER_LOAD_OFFEN_exact:
1476  case BUFFER_LOAD_OFFSET_exact:
1477  CI.EltSize = 4;
1478  if (findMatchingInst(CI)) {
1479  Modified = true;
1480  I = mergeBufferLoadPair(CI);
1481  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1482  } else {
1483  ++I;
1484  }
1485  continue;
1486  case BUFFER_STORE_OFFEN:
1487  case BUFFER_STORE_OFFSET:
1488  case BUFFER_STORE_OFFEN_exact:
1489  case BUFFER_STORE_OFFSET_exact:
1490  CI.EltSize = 4;
1491  if (findMatchingInst(CI)) {
1492  Modified = true;
1493  I = mergeBufferStorePair(CI);
1494  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1495  } else {
1496  ++I;
1497  }
1498  continue;
1499  }
1500 
1501  ++I;
1502  }
1503 
1504  return Modified;
1505 }
1506 
1507 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1508  if (skipFunction(MF.getFunction()))
1509  return false;
1510 
1511  STM = &MF.getSubtarget<GCNSubtarget>();
1512  if (!STM->loadStoreOptEnabled())
1513  return false;
1514 
1515  TII = STM->getInstrInfo();
1516  TRI = &TII->getRegisterInfo();
1517 
1518  MRI = &MF.getRegInfo();
1519  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1520 
1521  assert(MRI->isSSA() && "Must be run on SSA");
1522 
1523  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1524 
1525  bool Modified = false;
1526 
1527  for (MachineBasicBlock &MBB : MF) {
1528  do {
1529  OptimizeAgain = false;
1530  Modified |= optimizeBlock(MBB);
1531  } while (OptimizeAgain);
1532  }
1533 
1534  return Modified;
1535 }