SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass tries to fuse DS instructions with nearby immediate offsets.
11 // This will fuse operations such as
12 // ds_read_b32 v0, v2 offset:16
13 // ds_read_b32 v1, v2 offset:32
14 // ==>
15 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
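// (The merged offset0/offset1 fields are encoded in units of the element size,
// here 4 bytes, so the byte offsets 16 and 32 become 4 and 8.)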
16 //
17 // The same is done for certain SMEM and VMEM opcodes, e.g.:
18 // s_buffer_load_dword s4, s[0:3], 4
19 // s_buffer_load_dword s5, s[0:3], 8
20 // ==>
21 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
22 //
23 // This pass also tries to promote a constant offset to the immediate by
24 // adjusting the base. It tries to use a base from the nearby instructions that
25 // allows it to have a 13-bit constant offset and then promotes the 13-bit
26 // offset to the immediate.
27 // E.g.
28 // s_movk_i32 s0, 0x1800
29 // v_add_co_u32_e32 v0, vcc, s0, v2
30 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
31 //
32 // s_movk_i32 s0, 0x1000
33 // v_add_co_u32_e32 v5, vcc, s0, v2
34 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
35 // global_load_dwordx2 v[5:6], v[5:6], off
36 // global_load_dwordx2 v[0:1], v[0:1], off
37 // =>
38 // s_movk_i32 s0, 0x1000
39 // v_add_co_u32_e32 v5, vcc, s0, v2
40 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
41 // global_load_dwordx2 v[5:6], v[5:6], off
42 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
43 //
44 // Future improvements:
45 //
46 // - This currently relies on the scheduler to place loads and stores next to
47 // each other, and then only merges adjacent pairs of instructions. It would
48 // be good to be more flexible with interleaved instructions, and possibly run
49 // before scheduling. It currently misses stores of constants because loading
50 // the constant into the data register is placed between the stores, although
51 // this is arguably a scheduling problem.
52 //
53 // - Recomputing live intervals seems inefficient. This currently matches only
54 // one pair, recomputes live intervals, and moves on to the next pair. It
55 // would be better to compute a list of all merges that need to occur.
56 //
57 // - With a list of instructions to process, we can also merge more. If a
58 // cluster of loads has offsets that are too large to fit in the 8-bit
59 // offsets, but are close enough to each other to fit in 8 bits, we can add to
60 // the base pointer and use the new, reduced offsets.
61 //
62 //===----------------------------------------------------------------------===//
63 
64 #include "AMDGPU.h"
65 #include "AMDGPUSubtarget.h"
66 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
67 #include "SIInstrInfo.h"
68 #include "SIRegisterInfo.h"
69 #include "Utils/AMDGPUBaseInfo.h"
70 #include "llvm/ADT/ArrayRef.h"
71 #include "llvm/ADT/SmallVector.h"
72 #include "llvm/ADT/StringRef.h"
73 #include "llvm/Analysis/AliasAnalysis.h"
74 #include "llvm/CodeGen/MachineBasicBlock.h"
75 #include "llvm/CodeGen/MachineFunction.h"
76 #include "llvm/CodeGen/MachineFunctionPass.h"
77 #include "llvm/CodeGen/MachineInstr.h"
78 #include "llvm/CodeGen/MachineInstrBuilder.h"
79 #include "llvm/CodeGen/MachineOperand.h"
80 #include "llvm/CodeGen/MachineRegisterInfo.h"
81 #include "llvm/IR/DebugLoc.h"
82 #include "llvm/Pass.h"
83 #include "llvm/Support/Debug.h"
84 #include "llvm/Support/MathExtras.h"
85 #include "llvm/Support/raw_ostream.h"
86 #include <algorithm>
87 #include <cassert>
88 #include <cstdlib>
89 #include <iterator>
90 #include <utility>
91 
92 using namespace llvm;
93 
94 #define DEBUG_TYPE "si-load-store-opt"
95 
96 namespace {
97 enum InstClassEnum {
98  UNKNOWN,
99  DS_READ,
100  DS_WRITE,
101  S_BUFFER_LOAD_IMM,
102  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
103  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
104  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
105  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
106  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
107  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
108  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
109  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
110 };
111 
112 enum RegisterEnum {
113  SBASE = 0x1,
114  SRSRC = 0x2,
115  SOFFSET = 0x4,
116  VADDR = 0x8,
117  ADDR = 0x10,
118 };
119 
120 class SILoadStoreOptimizer : public MachineFunctionPass {
121  struct CombineInfo {
122  MachineBasicBlock::iterator I;
123  MachineBasicBlock::iterator Paired;
124  unsigned EltSize;
125  unsigned Offset0;
126  unsigned Offset1;
127  unsigned Width0;
128  unsigned Width1;
129  unsigned BaseOff;
130  InstClassEnum InstClass;
131  bool GLC0;
132  bool GLC1;
133  bool SLC0;
134  bool SLC1;
135  bool UseST64;
136  SmallVector<MachineInstr *, 8> InstsToMove;
137  };
138 
139  struct BaseRegisters {
140  unsigned LoReg = 0;
141  unsigned HiReg = 0;
142 
143  unsigned LoSubReg = 0;
144  unsigned HiSubReg = 0;
145  };
146 
147  struct MemAddress {
148  BaseRegisters Base;
149  int64_t Offset = 0;
150  };
151 
152  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
153 
154 private:
155  const GCNSubtarget *STM = nullptr;
156  const SIInstrInfo *TII = nullptr;
157  const SIRegisterInfo *TRI = nullptr;
158  MachineRegisterInfo *MRI = nullptr;
159  AliasAnalysis *AA = nullptr;
160  bool OptimizeAgain;
161 
162  static bool offsetsCanBeCombined(CombineInfo &CI);
163  static bool widthsFit(const CombineInfo &CI);
164  static unsigned getNewOpcode(const CombineInfo &CI);
165  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
166  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
167  unsigned getOpcodeWidth(const MachineInstr &MI);
168  InstClassEnum getInstClass(unsigned Opc);
169  unsigned getRegs(unsigned Opc);
170 
171  bool findMatchingInst(CombineInfo &CI);
172 
173  unsigned read2Opcode(unsigned EltSize) const;
174  unsigned read2ST64Opcode(unsigned EltSize) const;
175  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
176 
177  unsigned write2Opcode(unsigned EltSize) const;
178  unsigned write2ST64Opcode(unsigned EltSize) const;
179  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
180  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
181  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
182  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
183 
184  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
185  int32_t NewOffset);
186  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
187  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
188  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
189  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
190  /// Promotes constant offset to the immediate by adjusting the base. It
191  /// tries to use a base from the nearby instructions that allows it to have
192  /// a 13-bit constant offset which gets promoted to the immediate.
193  bool promoteConstantOffsetToImm(MachineInstr &CI,
194  MemInfoMap &Visited,
195  SmallPtrSet<MachineInstr *, 4> &AnchorList);
196 
197 public:
198  static char ID;
199 
200  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
201  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
202  }
203 
204  bool optimizeBlock(MachineBasicBlock &MBB);
205 
206  bool runOnMachineFunction(MachineFunction &MF) override;
207 
208  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
209 
210  void getAnalysisUsage(AnalysisUsage &AU) const override {
211  AU.setPreservesCFG();
212  AU.addRequired<AAResultsWrapperPass>();
213 
214  MachineFunctionPass::getAnalysisUsage(AU);
215  }
216 };
217 
218 } // end anonymous namespace.
219 
220 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
221  "SI Load Store Optimizer", false, false)
222 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
223 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
224  false, false)
225 
226 char SILoadStoreOptimizer::ID = 0;
227 
228 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
229 
230 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
231  return new SILoadStoreOptimizer();
232 }
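// Illustrative note (not part of the original source): the AMDGPU target pass
// pipeline is expected to schedule this pass by its ID, e.g.
//   addPass(&SILoadStoreOptimizerID);
// from its TargetPassConfig, or to construct it directly through
// createSILoadStoreOptimizerPass().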
233 
234 static void moveInstsAfter(MachineBasicBlock::iterator I,
235  ArrayRef<MachineInstr *> InstsToMove) {
236  MachineBasicBlock *MBB = I->getParent();
237  ++I;
238  for (MachineInstr *MI : InstsToMove) {
239  MI->removeFromParent();
240  MBB->insert(I, MI);
241  }
242 }
243 
244 static void addDefsUsesToList(const MachineInstr &MI,
245  DenseSet<unsigned> &RegDefs,
246  DenseSet<unsigned> &PhysRegUses) {
247  for (const MachineOperand &Op : MI.operands()) {
248  if (Op.isReg()) {
249  if (Op.isDef())
250  RegDefs.insert(Op.getReg());
251  else if (Op.readsReg() &&
252  TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
253  PhysRegUses.insert(Op.getReg());
254  }
255  }
256 }
257 
258 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
259  MachineBasicBlock::iterator B,
260  const SIInstrInfo *TII,
261  AliasAnalysis *AA) {
262  // RAW or WAR - cannot reorder
263  // WAW - cannot reorder
264  // RAR - safe to reorder
265  return !(A->mayStore() || B->mayStore()) ||
266  TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
267 }
268 
269 // Add MI and its defs to the lists if MI reads one of the defs that are
270 // already in the list. Returns true in that case.
271 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
272  DenseSet<unsigned> &PhysRegUses,
273  SmallVectorImpl<MachineInstr *> &Insts) {
274  for (MachineOperand &Use : MI.operands()) {
275  // If one of the defs is read, then there is a use of Def between I and the
276  // instruction that I will potentially be merged with. We will need to move
277  // this instruction after the merged instructions.
278  //
279  // Similarly, if there is a def which is read by an instruction that is to
280  // be moved for merging, then we need to move the def-instruction as well.
281  // This can only happen for physical registers such as M0; virtual
282  // registers are in SSA form.
283  if (Use.isReg() &&
284  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
285  (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
286  PhysRegUses.count(Use.getReg())))) {
287  Insts.push_back(&MI);
288  addDefsUsesToList(MI, RegDefs, PhysRegUses);
289  return true;
290  }
291  }
292 
293  return false;
294 }
295 
296 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
297  ArrayRef<MachineInstr *> InstsToMove,
298  const SIInstrInfo *TII, AliasAnalysis *AA) {
299  assert(MemOp.mayLoadOrStore());
300 
301  for (MachineInstr *InstToMove : InstsToMove) {
302  if (!InstToMove->mayLoadOrStore())
303  continue;
304  if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
305  return false;
306  }
307  return true;
308 }
309 
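// Worked example for the DS path below (values chosen for illustration): with
// EltSize = 4, byte offsets 1024 and 1280 give element offsets 256 and 320.
// Neither fits in 8 bits, but both are multiples of 64, so the ST64 form is
// selected with offset0 = 4 and offset1 = 5.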
310 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
311  // XXX - Would the same offset be OK? Is there any reason this would happen or
312  // be useful?
313  if (CI.Offset0 == CI.Offset1)
314  return false;
315 
316  // This won't be valid if the offset isn't aligned.
317  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
318  return false;
319 
320  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
321  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
322  CI.UseST64 = false;
323  CI.BaseOff = 0;
324 
325  // Handle SMEM and VMEM instructions.
326  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
327  return (EltOffset0 + CI.Width0 == EltOffset1 ||
328  EltOffset1 + CI.Width1 == EltOffset0) &&
329  CI.GLC0 == CI.GLC1 &&
330  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
331  }
332 
333  // If the offset in elements doesn't fit in 8 bits, we might be able to use
334  // the stride 64 versions.
335  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
336  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
337  CI.Offset0 = EltOffset0 / 64;
338  CI.Offset1 = EltOffset1 / 64;
339  CI.UseST64 = true;
340  return true;
341  }
342 
343  // Check if the new offsets fit in the reduced 8-bit range.
344  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
345  CI.Offset0 = EltOffset0;
346  CI.Offset1 = EltOffset1;
347  return true;
348  }
349 
350  // Try to shift base address to decrease offsets.
351  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
352  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
353 
354  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
355  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
356  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
357  CI.UseST64 = true;
358  return true;
359  }
360 
361  if (isUInt<8>(OffsetDiff)) {
362  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
363  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
364  return true;
365  }
366 
367  return false;
368 }
369 
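// Widths here are measured in dwords (see getOpcodeWidth). For example
// (illustrative), merging two S_BUFFER_LOAD_DWORDX2_IMM loads gives a combined
// width of 4, which widthsFit accepts and getNewOpcode later maps to
// S_BUFFER_LOAD_DWORDX4_IMM; a combined width of 3 is rejected for SMEM.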
370 bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
371  const unsigned Width = (CI.Width0 + CI.Width1);
372  switch (CI.InstClass) {
373  default:
374  return Width <= 4;
375  case S_BUFFER_LOAD_IMM:
376  switch (Width) {
377  default:
378  return false;
379  case 2:
380  case 4:
381  return true;
382  }
383  }
384 }
385 
386 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
387  const unsigned Opc = MI.getOpcode();
388 
389  if (TII->isMUBUF(MI)) {
390  return AMDGPU::getMUBUFDwords(Opc);
391  }
392 
393  switch (Opc) {
394  default:
395  return 0;
396  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
397  return 1;
398  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
399  return 2;
400  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
401  return 4;
402  }
403 }
404 
405 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
406  if (TII->isMUBUF(Opc)) {
407  const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
408 
409  // If we couldn't identify the opcode, bail out.
410  if (baseOpcode == -1) {
411  return UNKNOWN;
412  }
413 
414  switch (baseOpcode) {
415  default:
416  return UNKNOWN;
417  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
418  return BUFFER_LOAD_OFFEN;
419  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
420  return BUFFER_LOAD_OFFSET;
421  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
422  return BUFFER_STORE_OFFEN;
423  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
424  return BUFFER_STORE_OFFSET;
425  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
426  return BUFFER_LOAD_OFFEN_exact;
427  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
428  return BUFFER_LOAD_OFFSET_exact;
429  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
430  return BUFFER_STORE_OFFEN_exact;
431  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
432  return BUFFER_STORE_OFFSET_exact;
433  }
434  }
435 
436  switch (Opc) {
437  default:
438  return UNKNOWN;
439  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
440  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
441  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
442  return S_BUFFER_LOAD_IMM;
443  case AMDGPU::DS_READ_B32:
444  case AMDGPU::DS_READ_B64:
445  case AMDGPU::DS_READ_B32_gfx9:
446  case AMDGPU::DS_READ_B64_gfx9:
447  return DS_READ;
448  case AMDGPU::DS_WRITE_B32:
449  case AMDGPU::DS_WRITE_B64:
450  case AMDGPU::DS_WRITE_B32_gfx9:
451  case AMDGPU::DS_WRITE_B64_gfx9:
452  return DS_WRITE;
453  }
454 }
455 
456 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
457  if (TII->isMUBUF(Opc)) {
458  unsigned result = 0;
459 
460  if (AMDGPU::getMUBUFHasVAddr(Opc)) {
461  result |= VADDR;
462  }
463 
464  if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
465  result |= SRSRC;
466  }
467 
468  if (AMDGPU::getMUBUFHasSoffset(Opc)) {
469  result |= SOFFSET;
470  }
471 
472  return result;
473  }
474 
475  switch (Opc) {
476  default:
477  return 0;
478  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
479  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
480  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
481  return SBASE;
482  case AMDGPU::DS_READ_B32:
483  case AMDGPU::DS_READ_B64:
484  case AMDGPU::DS_READ_B32_gfx9:
485  case AMDGPU::DS_READ_B64_gfx9:
486  case AMDGPU::DS_WRITE_B32:
487  case AMDGPU::DS_WRITE_B64:
488  case AMDGPU::DS_WRITE_B32_gfx9:
489  case AMDGPU::DS_WRITE_B64_gfx9:
490  return ADDR;
491  }
492 }
493 
494 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
495  MachineBasicBlock *MBB = CI.I->getParent();
496  MachineBasicBlock::iterator E = MBB->end();
497  MachineBasicBlock::iterator MBBI = CI.I;
498 
499  const unsigned Opc = CI.I->getOpcode();
500  const InstClassEnum InstClass = getInstClass(Opc);
501 
502  if (InstClass == UNKNOWN) {
503  return false;
504  }
505 
506  const unsigned Regs = getRegs(Opc);
507 
508  unsigned AddrOpName[5] = {0};
509  int AddrIdx[5];
510  const MachineOperand *AddrReg[5];
511  unsigned NumAddresses = 0;
512 
513  if (Regs & ADDR) {
514  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
515  }
516 
517  if (Regs & SBASE) {
518  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
519  }
520 
521  if (Regs & SRSRC) {
522  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
523  }
524 
525  if (Regs & SOFFSET) {
526  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
527  }
528 
529  if (Regs & VADDR) {
530  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
531  }
532 
533  for (unsigned i = 0; i < NumAddresses; i++) {
534  AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
535  AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
536 
537  // We only ever merge operations with the same base address register, so
538  // don't bother scanning forward if there are no other uses.
539  if (AddrReg[i]->isReg() &&
540  (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
541  MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
542  return false;
543  }
544 
545  ++MBBI;
546 
547  DenseSet<unsigned> RegDefsToMove;
548  DenseSet<unsigned> PhysRegUsesToMove;
549  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
550 
551  for (; MBBI != E; ++MBBI) {
552  const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
553 
554  if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
555  (IsDS && (MBBI->getOpcode() != Opc))) {
556  // This is not a matching DS instruction, but we can keep looking as
557  // long as one of these conditions is met:
558  // 1. It is safe to move I down past MBBI.
559  // 2. It is safe to move MBBI down past the instruction that I will
560  // be merged into.
561 
562  if (MBBI->hasUnmodeledSideEffects()) {
563  // We can't re-order this instruction with respect to other memory
564  // operations, so we fail both conditions mentioned above.
565  return false;
566  }
567 
568  if (MBBI->mayLoadOrStore() &&
569  (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
570  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
571  // We fail condition #1, but we may still be able to satisfy condition
572  // #2. Add this instruction to the move list and then we will check
573  // if condition #2 holds once we have selected the matching instruction.
574  CI.InstsToMove.push_back(&*MBBI);
575  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
576  continue;
577  }
578 
579  // When we match I with another DS instruction, we will be moving I down
580  // to the location of the matched instruction; any uses of I will need to
581  // be moved down as well.
582  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
583  CI.InstsToMove);
584  continue;
585  }
586 
587  // Don't merge volatiles.
588  if (MBBI->hasOrderedMemoryRef())
589  return false;
590 
591  // Handle a case like
592  // DS_WRITE_B32 addr, v, idx0
593  // w = DS_READ_B32 addr, idx0
594  // DS_WRITE_B32 addr, f(w), idx1
595  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
596  // merging of the two writes.
597  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
598  CI.InstsToMove))
599  continue;
600 
601  bool Match = true;
602  for (unsigned i = 0; i < NumAddresses; i++) {
603  const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
604 
605  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
606  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
607  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
608  Match = false;
609  break;
610  }
611  continue;
612  }
613 
614  // Check same base pointer. Be careful of subregisters, which can occur
615  // with vectors of pointers.
616  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
617  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
618  Match = false;
619  break;
620  }
621  }
622 
623  if (Match) {
624  int OffsetIdx =
625  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
626  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
627  CI.Width0 = getOpcodeWidth(*CI.I);
628  CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
629  CI.Width1 = getOpcodeWidth(*MBBI);
630  CI.Paired = MBBI;
631 
632  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
633  CI.Offset0 &= 0xffff;
634  CI.Offset1 &= 0xffff;
635  } else {
636  CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
637  CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
638  if (CI.InstClass != S_BUFFER_LOAD_IMM) {
639  CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
640  CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
641  }
642  }
643 
644  // Check both offsets fit in the reduced range.
645  // We also need to go through the list of instructions that we plan to
646  // move and make sure they are all safe to move down past the merged
647  // instruction.
648  if (widthsFit(CI) && offsetsCanBeCombined(CI))
649  if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
650  return true;
651  }
652 
653  // We've found a load/store that we couldn't merge for some reason.
654  // We could potentially keep looking, but we'd need to make sure that
655  // it was safe to move I and also all the instructions in InstsToMove
656  // down past this instruction.
657  // Check if we can move I across MBBI and if we can move all of I's users.
658  if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
659  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
660  break;
661  }
662  return false;
663 }
664 
665 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
666  if (STM->ldsRequiresM0Init())
667  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
668  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
669 }
670 
671 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
672  if (STM->ldsRequiresM0Init())
673  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
674 
675  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
676  : AMDGPU::DS_READ2ST64_B64_gfx9;
677 }
678 
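// Illustrative merge (not taken from the original source):
//   %v0:vgpr_32 = DS_READ_B32 %addr, 0, 0
//   %v1:vgpr_32 = DS_READ_B32 %addr, 4, 0
// becomes roughly
//   %pair:vreg_64 = DS_READ2_B32 %addr, 0, 1, 0
//   %v0 = COPY %pair.sub0
//   %v1 = COPY %pair.sub1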
679 MachineBasicBlock::iterator
680 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
681  MachineBasicBlock *MBB = CI.I->getParent();
682 
683  // Be careful, since the addresses could be subregisters themselves in weird
684  // cases, like vectors of pointers.
685  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
686 
687  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
688  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
689 
690  unsigned NewOffset0 = CI.Offset0;
691  unsigned NewOffset1 = CI.Offset1;
692  unsigned Opc =
693  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
694 
695  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
696  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
697 
698  if (NewOffset0 > NewOffset1) {
699  // Canonicalize the merged instruction so the smaller offset comes first.
700  std::swap(NewOffset0, NewOffset1);
701  std::swap(SubRegIdx0, SubRegIdx1);
702  }
703 
704  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
705  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
706 
707  const MCInstrDesc &Read2Desc = TII->get(Opc);
708 
709  const TargetRegisterClass *SuperRC =
710  (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
711  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
712 
713  DebugLoc DL = CI.I->getDebugLoc();
714 
715  unsigned BaseReg = AddrReg->getReg();
716  unsigned BaseSubReg = AddrReg->getSubReg();
717  unsigned BaseRegFlags = 0;
718  if (CI.BaseOff) {
719  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
720  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
721  .addImm(CI.BaseOff);
722 
723  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
724  BaseRegFlags = RegState::Kill;
725 
726  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
727  .addReg(ImmReg)
728  .addReg(AddrReg->getReg(), 0, BaseSubReg);
729  BaseSubReg = 0;
730  }
731 
732  MachineInstrBuilder Read2 =
733  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
734  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
735  .addImm(NewOffset0) // offset0
736  .addImm(NewOffset1) // offset1
737  .addImm(0) // gds
738  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
739 
740  (void)Read2;
741 
742  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
743 
744  // Copy to the old destination registers.
745  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
746  .add(*Dest0) // Copy to same destination including flags and sub reg.
747  .addReg(DestReg, 0, SubRegIdx0);
748  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
749  .add(*Dest1)
750  .addReg(DestReg, RegState::Kill, SubRegIdx1);
751 
752  moveInstsAfter(Copy1, CI.InstsToMove);
753 
754  MachineBasicBlock::iterator Next = std::next(CI.I);
755  CI.I->eraseFromParent();
756  CI.Paired->eraseFromParent();
757 
758  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
759  return Next;
760 }
761 
762 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
763  if (STM->ldsRequiresM0Init())
764  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
765  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
766  : AMDGPU::DS_WRITE2_B64_gfx9;
767 }
768 
769 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
770  if (STM->ldsRequiresM0Init())
771  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
772  : AMDGPU::DS_WRITE2ST64_B64;
773 
774  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
775  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
776 }
777 
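// Illustrative merge (not taken from the original source):
//   DS_WRITE_B32 %addr, %v0, 0, 0
//   DS_WRITE_B32 %addr, %v1, 4, 0
// becomes roughly
//   DS_WRITE2_B32 %addr, %v0, %v1, 0, 1, 0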
778 MachineBasicBlock::iterator
779 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
780  MachineBasicBlock *MBB = CI.I->getParent();
781 
782  // Be sure to use .add() and not .addReg() with these. We want to be
783  // sure we preserve the subregister index and any register flags set on them.
784  const MachineOperand *AddrReg =
785  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
786  const MachineOperand *Data0 =
787  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
788  const MachineOperand *Data1 =
789  TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
790 
791  unsigned NewOffset0 = CI.Offset0;
792  unsigned NewOffset1 = CI.Offset1;
793  unsigned Opc =
794  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
795 
796  if (NewOffset0 > NewOffset1) {
797  // Canonicalize the merged instruction so the smaller offset comes first.
798  std::swap(NewOffset0, NewOffset1);
799  std::swap(Data0, Data1);
800  }
801 
802  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
803  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
804 
805  const MCInstrDesc &Write2Desc = TII->get(Opc);
806  DebugLoc DL = CI.I->getDebugLoc();
807 
808  unsigned BaseReg = AddrReg->getReg();
809  unsigned BaseSubReg = AddrReg->getSubReg();
810  unsigned BaseRegFlags = 0;
811  if (CI.BaseOff) {
812  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
813  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
814  .addImm(CI.BaseOff);
815 
816  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
817  BaseRegFlags = RegState::Kill;
818 
819  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
820  .addReg(ImmReg)
821  .addReg(AddrReg->getReg(), 0, BaseSubReg);
822  BaseSubReg = 0;
823  }
824 
825  MachineInstrBuilder Write2 =
826  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
827  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
828  .add(*Data0) // data0
829  .add(*Data1) // data1
830  .addImm(NewOffset0) // offset0
831  .addImm(NewOffset1) // offset1
832  .addImm(0) // gds
833  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
834 
835  moveInstsAfter(Write2, CI.InstsToMove);
836 
837  MachineBasicBlock::iterator Next = std::next(CI.I);
838  CI.I->eraseFromParent();
839  CI.Paired->eraseFromParent();
840 
841  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
842  return Next;
843 }
844 
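// Illustrative merge (not taken from the original source; the offsets assume
// the 4-byte SMRD offset encoding of newer subtargets):
//   %s4:sgpr_32 = S_BUFFER_LOAD_DWORD_IMM %sbase, 16, 0
//   %s5:sgpr_32 = S_BUFFER_LOAD_DWORD_IMM %sbase, 20, 0
// becomes roughly
//   %pair:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %sbase, 16, 0
//   %s4 = COPY %pair.sub0
//   %s5 = COPY %pair.sub1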
845 MachineBasicBlock::iterator
846 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
847  MachineBasicBlock *MBB = CI.I->getParent();
848  DebugLoc DL = CI.I->getDebugLoc();
849  const unsigned Opcode = getNewOpcode(CI);
850 
851  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
852 
853  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
854  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
855 
856  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
857  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
858  .addImm(MergedOffset) // offset
859  .addImm(CI.GLC0) // glc
860  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
861 
862  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
863  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
864  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
865 
866  // Copy to the old destination registers.
867  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
868  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
869  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
870 
871  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
872  .add(*Dest0) // Copy to same destination including flags and sub reg.
873  .addReg(DestReg, 0, SubRegIdx0);
874  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
875  .add(*Dest1)
876  .addReg(DestReg, RegState::Kill, SubRegIdx1);
877 
878  moveInstsAfter(Copy1, CI.InstsToMove);
879 
880  MachineBasicBlock::iterator Next = std::next(CI.I);
881  CI.I->eraseFromParent();
882  CI.Paired->eraseFromParent();
883  return Next;
884 }
885 
886 MachineBasicBlock::iterator
887 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
888  MachineBasicBlock *MBB = CI.I->getParent();
889  DebugLoc DL = CI.I->getDebugLoc();
890 
891  const unsigned Opcode = getNewOpcode(CI);
892 
893  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
894 
895  // Copy to the new source register.
896  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
897  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
898 
899  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
900 
901  const unsigned Regs = getRegs(Opcode);
902 
903  if (Regs & VADDR)
904  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
905 
906  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
907  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
908  .addImm(MergedOffset) // offset
909  .addImm(CI.GLC0) // glc
910  .addImm(CI.SLC0) // slc
911  .addImm(0) // tfe
912  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
913 
914  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
915  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
916  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
917 
918  // Copy to the old destination registers.
919  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
920  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
921  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
922 
923  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
924  .add(*Dest0) // Copy to same destination including flags and sub reg.
925  .addReg(DestReg, 0, SubRegIdx0);
926  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
927  .add(*Dest1)
928  .addReg(DestReg, RegState::Kill, SubRegIdx1);
929 
930  moveInstsAfter(Copy1, CI.InstsToMove);
931 
932  MachineBasicBlock::iterator Next = std::next(CI.I);
933  CI.I->eraseFromParent();
934  CI.Paired->eraseFromParent();
935  return Next;
936 }
937 
938 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
939  const unsigned Width = CI.Width0 + CI.Width1;
940 
941  switch (CI.InstClass) {
942  default:
943  return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
944  case UNKNOWN:
945  llvm_unreachable("Unknown instruction class");
946  case S_BUFFER_LOAD_IMM:
947  switch (Width) {
948  default:
949  return 0;
950  case 2:
951  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
952  case 4:
953  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
954  }
955  }
956 }
957 
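// For example (illustrative): Width0 = 1 and Width1 = 2 with Offset0 < Offset1
// yields (sub0, sub1_sub2); with Offset0 > Offset1 it yields (sub2, sub0_sub1).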
958 std::pair<unsigned, unsigned>
959 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
960  if (CI.Offset0 > CI.Offset1) {
961  switch (CI.Width0) {
962  default:
963  return std::make_pair(0, 0);
964  case 1:
965  switch (CI.Width1) {
966  default:
967  return std::make_pair(0, 0);
968  case 1:
969  return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
970  case 2:
971  return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
972  case 3:
973  return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
974  }
975  case 2:
976  switch (CI.Width1) {
977  default:
978  return std::make_pair(0, 0);
979  case 1:
980  return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
981  case 2:
982  return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
983  }
984  case 3:
985  switch (CI.Width1) {
986  default:
987  return std::make_pair(0, 0);
988  case 1:
989  return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
990  }
991  }
992  } else {
993  switch (CI.Width0) {
994  default:
995  return std::make_pair(0, 0);
996  case 1:
997  switch (CI.Width1) {
998  default:
999  return std::make_pair(0, 0);
1000  case 1:
1001  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1002  case 2:
1003  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1004  case 3:
1005  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1006  }
1007  case 2:
1008  switch (CI.Width1) {
1009  default:
1010  return std::make_pair(0, 0);
1011  case 1:
1012  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1013  case 2:
1014  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1015  }
1016  case 3:
1017  switch (CI.Width1) {
1018  default:
1019  return std::make_pair(0, 0);
1020  case 1:
1021  return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1022  }
1023  }
1024  }
1025 }
1026 
1027 const TargetRegisterClass *
1028 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1029  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1030  switch (CI.Width0 + CI.Width1) {
1031  default:
1032  return nullptr;
1033  case 2:
1034  return &AMDGPU::SReg_64_XEXECRegClass;
1035  case 4:
1036  return &AMDGPU::SReg_128RegClass;
1037  case 8:
1038  return &AMDGPU::SReg_256RegClass;
1039  case 16:
1040  return &AMDGPU::SReg_512RegClass;
1041  }
1042  } else {
1043  switch (CI.Width0 + CI.Width1) {
1044  default:
1045  return nullptr;
1046  case 2:
1047  return &AMDGPU::VReg_64RegClass;
1048  case 3:
1049  return &AMDGPU::VReg_96RegClass;
1050  case 4:
1051  return &AMDGPU::VReg_128RegClass;
1052  }
1053  }
1054 }
1055 
1056 MachineBasicBlock::iterator
1057 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1058  MachineBasicBlock *MBB = CI.I->getParent();
1059  DebugLoc DL = CI.I->getDebugLoc();
1060 
1061  const unsigned Opcode = getNewOpcode(CI);
1062 
1063  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1064  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1065  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1066 
1067  // Copy to the new source register.
1068  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1069  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1070 
1071  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1072  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1073 
1074  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1075  .add(*Src0)
1076  .addImm(SubRegIdx0)
1077  .add(*Src1)
1078  .addImm(SubRegIdx1);
1079 
1080  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1081  .addReg(SrcReg, RegState::Kill);
1082 
1083  const unsigned Regs = getRegs(Opcode);
1084 
1085  if (Regs & VADDR)
1086  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1087 
1088  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1089  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1090  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1091  .addImm(CI.GLC0) // glc
1092  .addImm(CI.SLC0) // slc
1093  .addImm(0) // tfe
1094  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1095 
1096  moveInstsAfter(MIB, CI.InstsToMove);
1097 
1098  MachineBasicBlock::iterator Next = std::next(CI.I);
1099  CI.I->eraseFromParent();
1100  CI.Paired->eraseFromParent();
1101  return Next;
1102 }
1103 
1104 MachineOperand
1105 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1106  APInt V(32, Val, true);
1107  if (TII->isInlineConstant(V))
1108  return MachineOperand::CreateImm(Val);
1109 
1110  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1111  MachineInstr *Mov =
1112  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1113  TII->get(AMDGPU::S_MOV_B32), Reg)
1114  .addImm(Val);
1115  (void)Mov;
1116  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1117  return MachineOperand::CreateReg(Reg, false);
1118 }
1119 
1120 // Compute base address using Addr and return the final register.
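// Illustrative shape of the emitted sequence (not from the original source),
// with the 32-bit offset halves materialized by createRegOrImm:
//   %lo:vgpr_32, %carry:sreg_64_xexec = V_ADD_I32_e64 %Base.LoReg, %off_lo
//   %hi:vgpr_32, %dead:sreg_64_xexec = V_ADDC_U32_e64 %Base.HiReg, %off_hi, %carry
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1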
1121 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1122  const MemAddress &Addr) {
1123  MachineBasicBlock *MBB = MI.getParent();
1124  MachineBasicBlock::iterator MBBI = MI.getIterator();
1125  DebugLoc DL = MI.getDebugLoc();
1126 
1127  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1128  Addr.Base.LoSubReg) &&
1129  "Expected 32-bit Base-Register-Low!!");
1130 
1131  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1132  Addr.Base.HiSubReg) &&
1133  "Expected 32-bit Base-Register-Hi!!");
1134 
1135  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1136  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1137  MachineOperand OffsetHi =
1138  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1139  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1140  unsigned DeadCarryReg =
1141  MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1142 
1143  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1144  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1145  MachineInstr *LoHalf =
1146  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1147  .addReg(CarryReg, RegState::Define)
1148  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1149  .add(OffsetLo);
1150  (void)LoHalf;
1151  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1152 
1153  MachineInstr *HiHalf =
1154  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1155  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1156  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1157  .add(OffsetHi)
1158  .addReg(CarryReg, RegState::Kill);
1159  (void)HiHalf;
1160  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1161 
1162  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1163  MachineInstr *FullBase =
1164  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1165  .addReg(DestSub0)
1166  .addImm(AMDGPU::sub0)
1167  .addReg(DestSub1)
1168  .addImm(AMDGPU::sub1);
1169  (void)FullBase;
1170  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1171 
1172  return FullDestReg;
1173 }
1174 
1175 // Update base and offset with the NewBase and NewOffset in MI.
1176 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1177  unsigned NewBase,
1178  int32_t NewOffset) {
1179  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1180  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1181 }
1182 
1183 Optional<int32_t>
1184 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1185  if (Op.isImm())
1186  return Op.getImm();
1187 
1188  if (!Op.isReg())
1189  return None;
1190 
1191  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1192  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1193  !Def->getOperand(1).isImm())
1194  return None;
1195 
1196  return Def->getOperand(1).getImm();
1197 }
1198 
1199 // Analyzes Base and extracts:
1200 // - 32-bit base registers and subregisters
1201 // - a 64-bit constant offset
1202 // It expects the base computation to look like:
1203 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1204 // %LO:vgpr_32, %c:sreg_64_xexec =
1205 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1206 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1207 // %Base:vreg_64 =
1208 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1209 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1210  MemAddress &Addr) {
1211  if (!Base.isReg())
1212  return;
1213 
1214  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1215  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1216  || Def->getNumOperands() != 5)
1217  return;
1218 
1219  MachineOperand BaseLo = Def->getOperand(1);
1220  MachineOperand BaseHi = Def->getOperand(3);
1221  if (!BaseLo.isReg() || !BaseHi.isReg())
1222  return;
1223 
1224  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1225  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1226 
1227  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1228  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1229  return;
1230 
1231  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1232  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1233 
1234  auto Offset0P = extractConstOffset(*Src0);
1235  if (Offset0P)
1236  BaseLo = *Src1;
1237  else {
1238  if (!(Offset0P = extractConstOffset(*Src1)))
1239  return;
1240  BaseLo = *Src0;
1241  }
1242 
1243  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1244  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1245 
1246  if (Src0->isImm())
1247  std::swap(Src0, Src1);
1248 
1249  if (!Src1->isImm())
1250  return;
1251 
1252  assert(isInt<32>(*Offset0P) && isInt<32>(Src1->getImm())
1253  && "Expected 32bit immediate!!!");
1254  uint64_t Offset1 = Src1->getImm();
1255  BaseHi = *Src0;
1256 
1257  Addr.Base.LoReg = BaseLo.getReg();
1258  Addr.Base.HiReg = BaseHi.getReg();
1259  Addr.Base.LoSubReg = BaseLo.getSubReg();
1260  Addr.Base.HiSubReg = BaseHi.getSubReg();
1261  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1262 }
1263 
1264 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1265  MachineInstr &MI,
1266  MemInfoMap &Visited,
1267  SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1268 
1269  // TODO: Support flat and scratch.
1270  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1271  TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1272  return false;
1273 
1274  // TODO: Support Store.
1275  if (!MI.mayLoad())
1276  return false;
1277 
1278  if (AnchorList.count(&MI))
1279  return false;
1280 
1281  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1282 
1283  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1284  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1285  return false;
1286  }
1287 
1288  // Step1: Find the base-registers and a 64bit constant offset.
1289  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1290  MemAddress MAddr;
1291  if (Visited.find(&MI) == Visited.end()) {
1292  processBaseWithConstOffset(Base, MAddr);
1293  Visited[&MI] = MAddr;
1294  } else
1295  MAddr = Visited[&MI];
1296 
1297  if (MAddr.Offset == 0) {
1298  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1299  " constant offsets that can be promoted.\n";);
1300  return false;
1301  }
1302 
1303  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1304  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1305 
1306  // Step2: Traverse through MI's basic block and find an anchor (that has the
1307  // same base registers) with the highest 13-bit distance from MI's offset.
1308  // E.g. (64bit loads)
1309  // bb:
1310  // addr1 = &a + 4096; load1 = load(addr1, 0)
1311  // addr2 = &a + 6144; load2 = load(addr2, 0)
1312  // addr3 = &a + 8192; load3 = load(addr3, 0)
1313  // addr4 = &a + 10240; load4 = load(addr4, 0)
1314  // addr5 = &a + 12288; load5 = load(addr5, 0)
1315  //
1316  // Starting from the first load, the optimization will try to find a new base
1317  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1318  // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
1319  // as the new base (anchor) because of the maximum distance, which can
1320  // presumably accommodate more intermediate bases.
1321  //
1322  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1323  // (&a + 8192) for load1, load2, load4.
1324  // addr = &a + 8192
1325  // load1 = load(addr, -4096)
1326  // load2 = load(addr, -2048)
1327  // load3 = load(addr, 0)
1328  // load4 = load(addr, 2048)
1329  // addr5 = &a + 12288; load5 = load(addr5, 0)
1330  //
1331  MachineInstr *AnchorInst = nullptr;
1332  MemAddress AnchorAddr;
1333  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1334  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1335 
1336  MachineBasicBlock *MBB = MI.getParent();
1337  MachineBasicBlock::iterator E = MBB->end();
1338  MachineBasicBlock::iterator MBBI = MI.getIterator();
1339  ++MBBI;
1340  const SITargetLowering *TLI =
1341  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1342 
1343  for ( ; MBBI != E; ++MBBI) {
1344  MachineInstr &MINext = *MBBI;
1345  // TODO: Support finding an anchor (with the same base) from store addresses or
1346  // any other load addresses where the opcodes are different.
1347  if (MINext.getOpcode() != MI.getOpcode() ||
1348  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1349  continue;
1350 
1351  const MachineOperand &BaseNext =
1352  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1353  MemAddress MAddrNext;
1354  if (Visited.find(&MINext) == Visited.end()) {
1355  processBaseWithConstOffset(BaseNext, MAddrNext);
1356  Visited[&MINext] = MAddrNext;
1357  } else
1358  MAddrNext = Visited[&MINext];
1359 
1360  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1361  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1362  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1363  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1364  continue;
1365 
1366  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1367 
1368  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1369  TargetLoweringBase::AddrMode AM;
1370  AM.HasBaseReg = true;
1371  AM.BaseOffs = Dist;
1372  if (TLI->isLegalGlobalAddressingMode(AM) &&
1373  (uint32_t)std::abs(Dist) > MaxDist) {
1374  MaxDist = std::abs(Dist);
1375 
1376  AnchorAddr = MAddrNext;
1377  AnchorInst = &MINext;
1378  }
1379  }
1380 
1381  if (AnchorInst) {
1382  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1383  AnchorInst->dump());
1384  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1385  << AnchorAddr.Offset << "\n\n");
1386 
1387  // Instead of moving up, just re-compute anchor-instruction's base address.
1388  unsigned Base = computeBase(MI, AnchorAddr);
1389 
1390  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1391  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1392 
1393  for (auto P : InstsWCommonBase) {
1394  TargetLoweringBase::AddrMode AM;
1395  AM.HasBaseReg = true;
1396  AM.BaseOffs = P.second - AnchorAddr.Offset;
1397 
1398  if (TLI->isLegalGlobalAddressingMode(AM)) {
1399  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1400  dbgs() << ")"; P.first->dump());
1401  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1402  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1403  }
1404  }
1405  AnchorList.insert(AnchorInst);
1406  return true;
1407  }
1408 
1409  return false;
1410 }
1411 
1412 // Scan through looking for adjacent LDS operations with constant offsets from
1413 // the same base register. We rely on the scheduler to do the hard work of
1414 // clustering nearby loads, and assume these are all adjacent.
1415 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1416  bool Modified = false;
1417 
1418  // Contains the base addresses already computed for visited instructions.
1419  MemInfoMap Visited;
1420  // Contains the list of instructions for which constant offsets are being
1421  // promoted to the IMM.
1422  SmallPtrSet<MachineInstr *, 4> AnchorList;
1423 
1424  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1425  MachineInstr &MI = *I;
1426 
1427  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1428  Modified = true;
1429 
1430  // Don't combine if volatile.
1431  if (MI.hasOrderedMemoryRef()) {
1432  ++I;
1433  continue;
1434  }
1435 
1436  const unsigned Opc = MI.getOpcode();
1437 
1438  CombineInfo CI;
1439  CI.I = I;
1440  CI.InstClass = getInstClass(Opc);
1441 
1442  switch (CI.InstClass) {
1443  default:
1444  break;
1445  case DS_READ:
1446  CI.EltSize =
1447  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1448  : 4;
1449  if (findMatchingInst(CI)) {
1450  Modified = true;
1451  I = mergeRead2Pair(CI);
1452  } else {
1453  ++I;
1454  }
1455  continue;
1456  case DS_WRITE:
1457  CI.EltSize =
1458  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1459  : 4;
1460  if (findMatchingInst(CI)) {
1461  Modified = true;
1462  I = mergeWrite2Pair(CI);
1463  } else {
1464  ++I;
1465  }
1466  continue;
1467  case S_BUFFER_LOAD_IMM:
1468  CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1469  if (findMatchingInst(CI)) {
1470  Modified = true;
1471  I = mergeSBufferLoadImmPair(CI);
1472  OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1473  } else {
1474  ++I;
1475  }
1476  continue;
1477  case BUFFER_LOAD_OFFEN:
1478  case BUFFER_LOAD_OFFSET:
1479  case BUFFER_LOAD_OFFEN_exact:
1480  case BUFFER_LOAD_OFFSET_exact:
1481  CI.EltSize = 4;
1482  if (findMatchingInst(CI)) {
1483  Modified = true;
1484  I = mergeBufferLoadPair(CI);
1485  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1486  } else {
1487  ++I;
1488  }
1489  continue;
1490  case BUFFER_STORE_OFFEN:
1491  case BUFFER_STORE_OFFSET:
1492  case BUFFER_STORE_OFFEN_exact:
1493  case BUFFER_STORE_OFFSET_exact:
1494  CI.EltSize = 4;
1495  if (findMatchingInst(CI)) {
1496  Modified = true;
1497  I = mergeBufferStorePair(CI);
1498  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1499  } else {
1500  ++I;
1501  }
1502  continue;
1503  }
1504 
1505  ++I;
1506  }
1507 
1508  return Modified;
1509 }
1510 
1511 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1512  if (skipFunction(MF.getFunction()))
1513  return false;
1514 
1515  STM = &MF.getSubtarget<GCNSubtarget>();
1516  if (!STM->loadStoreOptEnabled())
1517  return false;
1518 
1519  TII = STM->getInstrInfo();
1520  TRI = &TII->getRegisterInfo();
1521 
1522  MRI = &MF.getRegInfo();
1523  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1524 
1525  assert(MRI->isSSA() && "Must be run on SSA");
1526 
1527  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1528 
1529  bool Modified = false;
1530 
1531  for (MachineBasicBlock &MBB : MF) {
1532  do {
1533  OptimizeAgain = false;
1534  Modified |= optimizeBlock(MBB);
1535  } while (OptimizeAgain);
1536  }
1537 
1538  return Modified;
1539 }