LLVM 9.0.0svn
SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows the access to have a 13-bit constant offset, which is then promoted
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently relies on the scheduler to place loads and stores next to
46 // each other, and then only merges adjacent pairs of instructions. It would
47 // be good to be more flexible with interleaved instructions, and possibly run
48 // before scheduling. It currently missing stores of constants because loading
49 // the constant into the data register is placed between the stores, although
50 // this is arguably a scheduling problem.
51 //
52 // - Live interval recomputing seems inefficient. This currently only matches
53 // one pair, and recomputes live intervals and moves on to the next pair. It
54 // would be better to compute a list of all merges that need to occur.
55 //
56 // - With a list of instructions to process, we can also merge more. If a
57 // cluster of loads has offsets that are too large to fit in the 8-bit
58 // offset fields, but are close enough to each other, we can add to the base
59 // pointer and use the new, reduced offsets.
60 //
61 //===----------------------------------------------------------------------===//
62 
63 #include "AMDGPU.h"
64 #include "AMDGPUSubtarget.h"
65 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
66 #include "SIInstrInfo.h"
67 #include "SIRegisterInfo.h"
68 #include "Utils/AMDGPUBaseInfo.h"
69 #include "llvm/ADT/ArrayRef.h"
70 #include "llvm/ADT/SmallVector.h"
71 #include "llvm/ADT/StringRef.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/CodeGen/MachineBasicBlock.h"
74 #include "llvm/CodeGen/MachineFunction.h"
75 #include "llvm/CodeGen/MachineFunctionPass.h"
76 #include "llvm/CodeGen/MachineInstr.h"
77 #include "llvm/CodeGen/MachineInstrBuilder.h"
78 #include "llvm/CodeGen/MachineOperand.h"
79 #include "llvm/CodeGen/MachineRegisterInfo.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/MathExtras.h"
84 #include "llvm/Support/raw_ostream.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <cstdlib>
88 #include <iterator>
89 #include <utility>
90 
91 using namespace llvm;
92 
93 #define DEBUG_TYPE "si-load-store-opt"
94 
95 namespace {
96 enum InstClassEnum {
97   UNKNOWN,
98  DS_READ,
99  DS_WRITE,
100  S_BUFFER_LOAD_IMM,
101  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
102  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
103  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
104  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
105  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
106  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
107  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
108  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
109 };
110 
111 enum RegisterEnum {
112   SBASE = 0x1,
113  SRSRC = 0x2,
114  SOFFSET = 0x4,
115  VADDR = 0x8,
116  ADDR = 0x10,
117 };
118 
119 class SILoadStoreOptimizer : public MachineFunctionPass {
120  struct CombineInfo {
121   MachineBasicBlock::iterator I;
122   MachineBasicBlock::iterator Paired;
123   unsigned EltSize;
124  unsigned Offset0;
125  unsigned Offset1;
126  unsigned Width0;
127  unsigned Width1;
128  unsigned BaseOff;
129  InstClassEnum InstClass;
130  bool GLC0;
131  bool GLC1;
132  bool SLC0;
133  bool SLC1;
134  bool DLC0;
135  bool DLC1;
136  bool UseST64;
137  SmallVector<MachineInstr *, 8> InstsToMove;
138  };
139 
140  struct BaseRegisters {
141  unsigned LoReg = 0;
142  unsigned HiReg = 0;
143 
144  unsigned LoSubReg = 0;
145  unsigned HiSubReg = 0;
146  };
147 
148  struct MemAddress {
149  BaseRegisters Base;
150  int64_t Offset = 0;
151  };
152 
153  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
154 
155 private:
156  const GCNSubtarget *STM = nullptr;
157  const SIInstrInfo *TII = nullptr;
158  const SIRegisterInfo *TRI = nullptr;
159  MachineRegisterInfo *MRI = nullptr;
160  AliasAnalysis *AA = nullptr;
161  bool OptimizeAgain;
162 
163  static bool offsetsCanBeCombined(CombineInfo &CI);
164  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
165  static unsigned getNewOpcode(const CombineInfo &CI);
166  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
167  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
168  unsigned getOpcodeWidth(const MachineInstr &MI);
169  InstClassEnum getInstClass(unsigned Opc);
170  unsigned getRegs(unsigned Opc);
171 
172  bool findMatchingInst(CombineInfo &CI);
173 
174  unsigned read2Opcode(unsigned EltSize) const;
175  unsigned read2ST64Opcode(unsigned EltSize) const;
176  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
177 
178  unsigned write2Opcode(unsigned EltSize) const;
179  unsigned write2ST64Opcode(unsigned EltSize) const;
180  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
181  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
182  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
183  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
184 
185  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
186  int32_t NewOffset);
187  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
188  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
189  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
190  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
191  /// Promotes constant offset to the immediate by adjusting the base. It
192  /// tries to use a base from nearby instructions that allows it to have
193  /// a 13-bit constant offset which gets promoted to the immediate.
194  bool promoteConstantOffsetToImm(MachineInstr &CI,
195                                   MemInfoMap &Visited,
196                                   SmallPtrSet<MachineInstr *, 4> &AnchorList);
197 
198 public:
199  static char ID;
200 
201  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
202    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
203  }
204 
205  bool optimizeBlock(MachineBasicBlock &MBB);
206 
207  bool runOnMachineFunction(MachineFunction &MF) override;
208 
209  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
210 
211  void getAnalysisUsage(AnalysisUsage &AU) const override {
212  AU.setPreservesCFG();
213  AU.addRequired<AAResultsWrapperPass>();
214 
215  MachineFunctionPass::getAnalysisUsage(AU);
216  }
217 };
218 
219 } // end anonymous namespace.
220 
221 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
222  "SI Load Store Optimizer", false, false)
223 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
224 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
225                     false, false)
226 
227 char SILoadStoreOptimizer::ID = 0;
228 
229 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
230 
231 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
232  return new SILoadStoreOptimizer();
233 }
234 
235 static void moveInstsAfter(MachineBasicBlock::iterator I,
236                            ArrayRef<MachineInstr *> InstsToMove) {
237  MachineBasicBlock *MBB = I->getParent();
238  ++I;
239  for (MachineInstr *MI : InstsToMove) {
240  MI->removeFromParent();
241  MBB->insert(I, MI);
242  }
243 }
244 
245 static void addDefsUsesToList(const MachineInstr &MI,
246  DenseSet<unsigned> &RegDefs,
247  DenseSet<unsigned> &PhysRegUses) {
248  for (const MachineOperand &Op : MI.operands()) {
249  if (Op.isReg()) {
250  if (Op.isDef())
251  RegDefs.insert(Op.getReg());
252  else if (Op.readsReg() &&
253            TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
254  PhysRegUses.insert(Op.getReg());
255  }
256  }
257 }
258 
259 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
260                                       MachineBasicBlock::iterator B,
261                                       AliasAnalysis *AA) {
262  // RAW or WAR - cannot reorder
263  // WAW - cannot reorder
264  // RAR - safe to reorder
265  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
266 }
267 
268 // Add MI and its defs to the lists if MI reads one of the defs that are
269 // already in the list. Returns true in that case.
270 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
271                                   DenseSet<unsigned> &PhysRegUses,
272                                   SmallVectorImpl<MachineInstr *> &Insts) {
273  for (MachineOperand &Use : MI.operands()) {
274  // If one of the defs is read, then there is a use of Def between I and the
275  // instruction that I will potentially be merged with. We will need to move
276  // this instruction after the merged instructions.
277  //
278  // Similarly, if there is a def which is read by an instruction that is to
279  // be moved for merging, then we need to move the def-instruction as well.
280  // This can only happen for physical registers such as M0; virtual
281  // registers are in SSA form.
282  if (Use.isReg() &&
283  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
284  (Use.isDef() && RegDefs.count(Use.getReg())) ||
285  (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
286  PhysRegUses.count(Use.getReg())))) {
287  Insts.push_back(&MI);
288  addDefsUsesToList(MI, RegDefs, PhysRegUses);
289  return true;
290  }
291  }
292 
293  return false;
294 }
295 
296 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
297                                     ArrayRef<MachineInstr *> InstsToMove,
298  AliasAnalysis *AA) {
299  assert(MemOp.mayLoadOrStore());
300 
301  for (MachineInstr *InstToMove : InstsToMove) {
302  if (!InstToMove->mayLoadOrStore())
303  continue;
304  if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
305  return false;
306  }
307  return true;
308 }
309 
310 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
311  // XXX - Would the same offset be OK? Is there any reason this would happen or
312  // be useful?
313  if (CI.Offset0 == CI.Offset1)
314  return false;
315 
316  // This won't be valid if the offset isn't aligned.
317  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
318  return false;
319 
320  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
321  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
322  CI.UseST64 = false;
323  CI.BaseOff = 0;
324 
325  // Handle SMEM and VMEM instructions.
326  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
327  return (EltOffset0 + CI.Width0 == EltOffset1 ||
328  EltOffset1 + CI.Width1 == EltOffset0) &&
329  CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
330  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
331  }
332 
333 // If the offset in elements doesn't fit in 8 bits, we might be able to use
334  // the stride 64 versions.
335  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
336  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
337  CI.Offset0 = EltOffset0 / 64;
338  CI.Offset1 = EltOffset1 / 64;
339  CI.UseST64 = true;
340  return true;
341  }
342 
343  // Check if the new offsets fit in the reduced 8-bit range.
344  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
345  CI.Offset0 = EltOffset0;
346  CI.Offset1 = EltOffset1;
347  return true;
348  }
349 
350  // Try to shift base address to decrease offsets.
351  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
352  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
353 
354  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
355  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
356  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
357  CI.UseST64 = true;
358  return true;
359  }
360 
361  if (isUInt<8>(OffsetDiff)) {
362  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
363  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
364  return true;
365  }
366 
367  return false;
368 }
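// Worked example of the base-shift path above (illustrative values, not taken
// from the original source): two ds_read_b32 at byte offsets 0x1800 and 0x1804
// give EltOffset0 = 1536 and EltOffset1 = 1537 with EltSize = 4. Neither fits
// in 8 bits and they are not both multiples of 64, but OffsetDiff = 1 does fit,
// so BaseOff becomes 0x1800 and the pair can be encoded as offset0:0 offset1:1
// once mergeRead2Pair has bumped the base pointer by 0x1800.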
369 
370 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
371  const CombineInfo &CI) {
372  const unsigned Width = (CI.Width0 + CI.Width1);
373  switch (CI.InstClass) {
374  default:
375  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
376  case S_BUFFER_LOAD_IMM:
377  switch (Width) {
378  default:
379  return false;
380  case 2:
381  case 4:
382  return true;
383  }
384  }
385 }
386 
387 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
388  const unsigned Opc = MI.getOpcode();
389 
390  if (TII->isMUBUF(MI)) {
391  return AMDGPU::getMUBUFDwords(Opc);
392  }
393 
394  switch (Opc) {
395  default:
396  return 0;
397  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
398  return 1;
399  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
400  return 2;
401  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
402  return 4;
403  }
404 }
405 
406 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
407  if (TII->isMUBUF(Opc)) {
408  const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
409 
410  // If we couldn't identify the opcode, bail out.
411  if (baseOpcode == -1) {
412  return UNKNOWN;
413  }
414 
415  switch (baseOpcode) {
416  default:
417  return UNKNOWN;
418  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
419  return BUFFER_LOAD_OFFEN;
420  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
421  return BUFFER_LOAD_OFFSET;
422  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
423  return BUFFER_STORE_OFFEN;
424  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
425  return BUFFER_STORE_OFFSET;
426  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
427  return BUFFER_LOAD_OFFEN_exact;
428  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
429  return BUFFER_LOAD_OFFSET_exact;
430  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
431  return BUFFER_STORE_OFFEN_exact;
432  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
433  return BUFFER_STORE_OFFSET_exact;
434  }
435  }
436 
437  switch (Opc) {
438  default:
439  return UNKNOWN;
440  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
441  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
442  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
443  return S_BUFFER_LOAD_IMM;
444  case AMDGPU::DS_READ_B32:
445  case AMDGPU::DS_READ_B64:
446  case AMDGPU::DS_READ_B32_gfx9:
447  case AMDGPU::DS_READ_B64_gfx9:
448  return DS_READ;
449  case AMDGPU::DS_WRITE_B32:
450  case AMDGPU::DS_WRITE_B64:
451  case AMDGPU::DS_WRITE_B32_gfx9:
452  case AMDGPU::DS_WRITE_B64_gfx9:
453  return DS_WRITE;
454  }
455 }
456 
457 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
458  if (TII->isMUBUF(Opc)) {
459  unsigned result = 0;
460 
461  if (AMDGPU::getMUBUFHasVAddr(Opc)) {
462  result |= VADDR;
463  }
464 
465  if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
466  result |= SRSRC;
467  }
468 
469  if (AMDGPU::getMUBUFHasSoffset(Opc)) {
470  result |= SOFFSET;
471  }
472 
473  return result;
474  }
475 
476  switch (Opc) {
477  default:
478  return 0;
479  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
480  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
481  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
482  return SBASE;
483  case AMDGPU::DS_READ_B32:
484  case AMDGPU::DS_READ_B64:
485  case AMDGPU::DS_READ_B32_gfx9:
486  case AMDGPU::DS_READ_B64_gfx9:
487  case AMDGPU::DS_WRITE_B32:
488  case AMDGPU::DS_WRITE_B64:
489  case AMDGPU::DS_WRITE_B32_gfx9:
490  case AMDGPU::DS_WRITE_B64_gfx9:
491  return ADDR;
492  }
493 }
494 
495 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
496  MachineBasicBlock *MBB = CI.I->getParent();
497  MachineBasicBlock::iterator E = MBB->end();
498  MachineBasicBlock::iterator MBBI = CI.I;
499 
500  const unsigned Opc = CI.I->getOpcode();
501  const InstClassEnum InstClass = getInstClass(Opc);
502 
503  if (InstClass == UNKNOWN) {
504  return false;
505  }
506 
507  const unsigned Regs = getRegs(Opc);
508 
509  unsigned AddrOpName[5] = {0};
510  int AddrIdx[5];
511  const MachineOperand *AddrReg[5];
512  unsigned NumAddresses = 0;
513 
514  if (Regs & ADDR) {
515  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
516  }
517 
518  if (Regs & SBASE) {
519  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
520  }
521 
522  if (Regs & SRSRC) {
523  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
524  }
525 
526  if (Regs & SOFFSET) {
527  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
528  }
529 
530  if (Regs & VADDR) {
531  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
532  }
533 
534  for (unsigned i = 0; i < NumAddresses; i++) {
535  AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
536  AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
537 
538  // We only ever merge operations with the same base address register, so
539  // don't bother scanning forward if there are no other uses.
540  if (AddrReg[i]->isReg() &&
541       (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
542        MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
543  return false;
544  }
545 
546  ++MBBI;
547 
548  DenseSet<unsigned> RegDefsToMove;
549  DenseSet<unsigned> PhysRegUsesToMove;
550  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
551 
552  for (; MBBI != E; ++MBBI) {
553  const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
554 
555  if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
556  (IsDS && (MBBI->getOpcode() != Opc))) {
557  // This is not a matching DS instruction, but we can keep looking as
558 // long as one of these conditions is met:
559  // 1. It is safe to move I down past MBBI.
560  // 2. It is safe to move MBBI down past the instruction that I will
561  // be merged into.
562 
563  if (MBBI->hasUnmodeledSideEffects()) {
564  // We can't re-order this instruction with respect to other memory
565  // operations, so we fail both conditions mentioned above.
566  return false;
567  }
568 
569  if (MBBI->mayLoadOrStore() &&
570  (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
571  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
572  // We fail condition #1, but we may still be able to satisfy condition
573  // #2. Add this instruction to the move list and then we will check
574  // if condition #2 holds once we have selected the matching instruction.
575  CI.InstsToMove.push_back(&*MBBI);
576  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
577  continue;
578  }
579 
580  // When we match I with another DS instruction we will be moving I down
581 // to the location of the matched instruction; any uses of I will need to
582  // be moved down as well.
583  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
584  CI.InstsToMove);
585  continue;
586  }
587 
588  // Don't merge volatiles.
589  if (MBBI->hasOrderedMemoryRef())
590  return false;
591 
592  // Handle a case like
593  // DS_WRITE_B32 addr, v, idx0
594  // w = DS_READ_B32 addr, idx0
595  // DS_WRITE_B32 addr, f(w), idx1
596  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
597  // merging of the two writes.
598  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
599  CI.InstsToMove))
600  continue;
601 
602  bool Match = true;
603  for (unsigned i = 0; i < NumAddresses; i++) {
604  const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
605 
606  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
607  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
608  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
609  Match = false;
610  break;
611  }
612  continue;
613  }
614 
615  // Check same base pointer. Be careful of subregisters, which can occur
616  // with vectors of pointers.
617  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
618  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
619  Match = false;
620  break;
621  }
622  }
623 
624  if (Match) {
625  int OffsetIdx =
626  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
627  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
628  CI.Width0 = getOpcodeWidth(*CI.I);
629  CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
630  CI.Width1 = getOpcodeWidth(*MBBI);
631  CI.Paired = MBBI;
632 
633  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
634  CI.Offset0 &= 0xffff;
635  CI.Offset1 &= 0xffff;
636  } else {
637  CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
638  CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
639  if (CI.InstClass != S_BUFFER_LOAD_IMM) {
640  CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
641  CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
642  }
643  CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
644  CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
645  }
646 
647 // Check that both offsets fit in the reduced range.
648  // We also need to go through the list of instructions that we plan to
649  // move and make sure they are all safe to move down past the merged
650  // instruction.
651  if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
652  if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
653  return true;
654  }
655 
656  // We've found a load/store that we couldn't merge for some reason.
657  // We could potentially keep looking, but we'd need to make sure that
658 // it was safe to move I and also all the instructions in InstsToMove
659 // down past this instruction.
660 // Check whether we can move I across MBBI and whether we can move all I's users.
661  if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
662  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
663  break;
664  }
665  return false;
666 }
667 
668 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
669  if (STM->ldsRequiresM0Init())
670  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
671  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
672 }
673 
674 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
675  if (STM->ldsRequiresM0Init())
676  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
677 
678  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
679  : AMDGPU::DS_READ2ST64_B64_gfx9;
680 }
681 
682 MachineBasicBlock::iterator
683 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
684  MachineBasicBlock *MBB = CI.I->getParent();
685 
686  // Be careful, since the addresses could be subregisters themselves in weird
687  // cases, like vectors of pointers.
688  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
689 
690  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
691  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
692 
693  unsigned NewOffset0 = CI.Offset0;
694  unsigned NewOffset1 = CI.Offset1;
695  unsigned Opc =
696  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
697 
698  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
699  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
700 
701  if (NewOffset0 > NewOffset1) {
702  // Canonicalize the merged instruction so the smaller offset comes first.
703  std::swap(NewOffset0, NewOffset1);
704  std::swap(SubRegIdx0, SubRegIdx1);
705  }
706 
707  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
708  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
709 
710  const MCInstrDesc &Read2Desc = TII->get(Opc);
711 
712  const TargetRegisterClass *SuperRC =
713  (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
714  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
715 
716  DebugLoc DL = CI.I->getDebugLoc();
717 
718  unsigned BaseReg = AddrReg->getReg();
719  unsigned BaseSubReg = AddrReg->getSubReg();
720  unsigned BaseRegFlags = 0;
721  if (CI.BaseOff) {
722  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
723  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
724  .addImm(CI.BaseOff);
725 
726  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
727  BaseRegFlags = RegState::Kill;
728 
729  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
730  .addReg(ImmReg)
731  .addReg(AddrReg->getReg(), 0, BaseSubReg)
732  .addImm(0); // clamp bit
733  BaseSubReg = 0;
734  }
735 
736  MachineInstrBuilder Read2 =
737  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
738  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
739  .addImm(NewOffset0) // offset0
740  .addImm(NewOffset1) // offset1
741  .addImm(0) // gds
742  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
743 
744  (void)Read2;
745 
746  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
747 
748  // Copy to the old destination registers.
749  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
750  .add(*Dest0) // Copy to same destination including flags and sub reg.
751  .addReg(DestReg, 0, SubRegIdx0);
752  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
753  .add(*Dest1)
754  .addReg(DestReg, RegState::Kill, SubRegIdx1);
755 
756  moveInstsAfter(Copy1, CI.InstsToMove);
757 
758  MachineBasicBlock::iterator Next = std::next(CI.I);
759  CI.I->eraseFromParent();
760  CI.Paired->eraseFromParent();
761 
762  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
763  return Next;
764 }
765 
766 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
767  if (STM->ldsRequiresM0Init())
768  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
769  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
770  : AMDGPU::DS_WRITE2_B64_gfx9;
771 }
772 
773 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
774  if (STM->ldsRequiresM0Init())
775  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
776  : AMDGPU::DS_WRITE2ST64_B64;
777 
778  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
779  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
780 }
781 
782 MachineBasicBlock::iterator
783 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
784  MachineBasicBlock *MBB = CI.I->getParent();
785 
786  // Be sure to use .add(), and not .addReg(), with these. We want to be
787  // sure we preserve the subregister index and any register flags set on them.
788  const MachineOperand *AddrReg =
789  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
790  const MachineOperand *Data0 =
791  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
792  const MachineOperand *Data1 =
793  TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
794 
795  unsigned NewOffset0 = CI.Offset0;
796  unsigned NewOffset1 = CI.Offset1;
797  unsigned Opc =
798  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
799 
800  if (NewOffset0 > NewOffset1) {
801  // Canonicalize the merged instruction so the smaller offset comes first.
802  std::swap(NewOffset0, NewOffset1);
803  std::swap(Data0, Data1);
804  }
805 
806  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
807  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
808 
809  const MCInstrDesc &Write2Desc = TII->get(Opc);
810  DebugLoc DL = CI.I->getDebugLoc();
811 
812  unsigned BaseReg = AddrReg->getReg();
813  unsigned BaseSubReg = AddrReg->getSubReg();
814  unsigned BaseRegFlags = 0;
815  if (CI.BaseOff) {
816  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
817  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
818  .addImm(CI.BaseOff);
819 
820  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
821  BaseRegFlags = RegState::Kill;
822 
823  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
824  .addReg(ImmReg)
825  .addReg(AddrReg->getReg(), 0, BaseSubReg)
826  .addImm(0); // clamp bit
827  BaseSubReg = 0;
828  }
829 
830  MachineInstrBuilder Write2 =
831  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
832  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
833  .add(*Data0) // data0
834  .add(*Data1) // data1
835  .addImm(NewOffset0) // offset0
836  .addImm(NewOffset1) // offset1
837  .addImm(0) // gds
838  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
839 
840  moveInstsAfter(Write2, CI.InstsToMove);
841 
842  MachineBasicBlock::iterator Next = std::next(CI.I);
843  CI.I->eraseFromParent();
844  CI.Paired->eraseFromParent();
845 
846  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
847  return Next;
848 }
849 
850 MachineBasicBlock::iterator
851 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
852  MachineBasicBlock *MBB = CI.I->getParent();
853  DebugLoc DL = CI.I->getDebugLoc();
854  const unsigned Opcode = getNewOpcode(CI);
855 
856  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
857 
858  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
859  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
860 
861  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
862  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
863  .addImm(MergedOffset) // offset
864  .addImm(CI.GLC0) // glc
865  .addImm(CI.DLC0) // dlc
866  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
867 
868  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
869  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
870  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
871 
872  // Copy to the old destination registers.
873  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
874  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
875  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
876 
877  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
878  .add(*Dest0) // Copy to same destination including flags and sub reg.
879  .addReg(DestReg, 0, SubRegIdx0);
880  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
881  .add(*Dest1)
882  .addReg(DestReg, RegState::Kill, SubRegIdx1);
883 
884  moveInstsAfter(Copy1, CI.InstsToMove);
885 
886  MachineBasicBlock::iterator Next = std::next(CI.I);
887  CI.I->eraseFromParent();
888  CI.Paired->eraseFromParent();
889  return Next;
890 }
891 
892 MachineBasicBlock::iterator
893 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
894  MachineBasicBlock *MBB = CI.I->getParent();
895  DebugLoc DL = CI.I->getDebugLoc();
896 
897  const unsigned Opcode = getNewOpcode(CI);
898 
899  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
900 
901  // Copy to the new source register.
902  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
903  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
904 
905  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
906 
907  const unsigned Regs = getRegs(Opcode);
908 
909  if (Regs & VADDR)
910  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
911 
912  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
913  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
914  .addImm(MergedOffset) // offset
915  .addImm(CI.GLC0) // glc
916  .addImm(CI.SLC0) // slc
917  .addImm(0) // tfe
918  .addImm(CI.DLC0) // dlc
919  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
920 
921  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
922  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
923  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
924 
925  // Copy to the old destination registers.
926  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
927  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
928  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
929 
930  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
931  .add(*Dest0) // Copy to same destination including flags and sub reg.
932  .addReg(DestReg, 0, SubRegIdx0);
933  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
934  .add(*Dest1)
935  .addReg(DestReg, RegState::Kill, SubRegIdx1);
936 
937  moveInstsAfter(Copy1, CI.InstsToMove);
938 
939  MachineBasicBlock::iterator Next = std::next(CI.I);
940  CI.I->eraseFromParent();
941  CI.Paired->eraseFromParent();
942  return Next;
943 }
944 
945 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
946  const unsigned Width = CI.Width0 + CI.Width1;
947 
948  switch (CI.InstClass) {
949  default:
950  return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
951  case UNKNOWN:
952  llvm_unreachable("Unknown instruction class");
953  case S_BUFFER_LOAD_IMM:
954  switch (Width) {
955  default:
956  return 0;
957  case 2:
958  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
959  case 4:
960  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
961  }
962  }
963 }
964 
965 std::pair<unsigned, unsigned>
966 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
967  if (CI.Offset0 > CI.Offset1) {
968  switch (CI.Width0) {
969  default:
970  return std::make_pair(0, 0);
971  case 1:
972  switch (CI.Width1) {
973  default:
974  return std::make_pair(0, 0);
975  case 1:
976  return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
977  case 2:
978  return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
979  case 3:
980  return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
981  }
982  case 2:
983  switch (CI.Width1) {
984  default:
985  return std::make_pair(0, 0);
986  case 1:
987  return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
988  case 2:
989  return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
990  }
991  case 3:
992  switch (CI.Width1) {
993  default:
994  return std::make_pair(0, 0);
995  case 1:
996  return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
997  }
998  }
999  } else {
1000  switch (CI.Width0) {
1001  default:
1002  return std::make_pair(0, 0);
1003  case 1:
1004  switch (CI.Width1) {
1005  default:
1006  return std::make_pair(0, 0);
1007  case 1:
1008  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1009  case 2:
1010  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1011  case 3:
1012  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1013  }
1014  case 2:
1015  switch (CI.Width1) {
1016  default:
1017  return std::make_pair(0, 0);
1018  case 1:
1019  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1020  case 2:
1021  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1022  }
1023  case 3:
1024  switch (CI.Width1) {
1025  default:
1026  return std::make_pair(0, 0);
1027  case 1:
1028  return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1029  }
1030  }
1031  }
1032 }
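// Illustrative example of how getNewOpcode and getSubRegIdxs cooperate
// (assuming a subtarget where widthsFit allows dwordx3): merging a one-dword
// buffer load at the lower offset with a two-dword load yields the dwordx3
// opcode from getMUBUFOpcode, and getSubRegIdxs returns (sub0, sub1_sub2), so
// the first original destination is copied out of sub0 of the new 96-bit
// result and the second out of sub1_sub2.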
1033 
1034 const TargetRegisterClass *
1035 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1036  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1037  switch (CI.Width0 + CI.Width1) {
1038  default:
1039  return nullptr;
1040  case 2:
1041  return &AMDGPU::SReg_64_XEXECRegClass;
1042  case 4:
1043  return &AMDGPU::SReg_128RegClass;
1044  case 8:
1045  return &AMDGPU::SReg_256RegClass;
1046  case 16:
1047  return &AMDGPU::SReg_512RegClass;
1048  }
1049  } else {
1050  switch (CI.Width0 + CI.Width1) {
1051  default:
1052  return nullptr;
1053  case 2:
1054  return &AMDGPU::VReg_64RegClass;
1055  case 3:
1056  return &AMDGPU::VReg_96RegClass;
1057  case 4:
1058  return &AMDGPU::VReg_128RegClass;
1059  }
1060  }
1061 }
1062 
1063 MachineBasicBlock::iterator
1064 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1065  MachineBasicBlock *MBB = CI.I->getParent();
1066  DebugLoc DL = CI.I->getDebugLoc();
1067 
1068  const unsigned Opcode = getNewOpcode(CI);
1069 
1070  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1071  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1072  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1073 
1074  // Copy to the new source register.
1075  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1076  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1077 
1078  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1079  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1080 
1081  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1082  .add(*Src0)
1083  .addImm(SubRegIdx0)
1084  .add(*Src1)
1085  .addImm(SubRegIdx1);
1086 
1087  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1088  .addReg(SrcReg, RegState::Kill);
1089 
1090  const unsigned Regs = getRegs(Opcode);
1091 
1092  if (Regs & VADDR)
1093  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1094 
1095  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1096  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1097  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1098  .addImm(CI.GLC0) // glc
1099  .addImm(CI.SLC0) // slc
1100  .addImm(0) // tfe
1101  .addImm(CI.DLC0) // dlc
1102  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1103 
1104  moveInstsAfter(MIB, CI.InstsToMove);
1105 
1106  MachineBasicBlock::iterator Next = std::next(CI.I);
1107  CI.I->eraseFromParent();
1108  CI.Paired->eraseFromParent();
1109  return Next;
1110 }
1111 
1112 MachineOperand
1113 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1114  APInt V(32, Val, true);
1115  if (TII->isInlineConstant(V))
1116  return MachineOperand::CreateImm(Val);
1117 
1118  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1119  MachineInstr *Mov =
1120  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1121  TII->get(AMDGPU::S_MOV_B32), Reg)
1122  .addImm(Val);
1123  (void)Mov;
1124  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1125  return MachineOperand::CreateReg(Reg, false);
1126 }
1127 
1128 // Compute base address using Addr and return the final register.
1129 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1130  const MemAddress &Addr) {
1131  MachineBasicBlock *MBB = MI.getParent();
1132  MachineBasicBlock::iterator MBBI = MI.getIterator();
1133  DebugLoc DL = MI.getDebugLoc();
1134 
1135  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1136  Addr.Base.LoSubReg) &&
1137  "Expected 32-bit Base-Register-Low!!");
1138 
1139  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1140  Addr.Base.HiSubReg) &&
1141  "Expected 32-bit Base-Register-Hi!!");
1142 
1143  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1144  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1145  MachineOperand OffsetHi =
1146  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1147  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1148  unsigned DeadCarryReg =
1149  MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1150 
1151  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1152  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1153  MachineInstr *LoHalf =
1154  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1155  .addReg(CarryReg, RegState::Define)
1156  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1157  .add(OffsetLo)
1158  .addImm(0); // clamp bit
1159  (void)LoHalf;
1160  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1161 
1162  MachineInstr *HiHalf =
1163  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1164  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1165  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1166  .add(OffsetHi)
1167  .addReg(CarryReg, RegState::Kill)
1168  .addImm(0); // clamp bit
1169  (void)HiHalf;
1170  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1171 
1172  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1173  MachineInstr *FullBase =
1174  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1175  .addReg(DestSub0)
1176  .addImm(AMDGPU::sub0)
1177  .addReg(DestSub1)
1178  .addImm(AMDGPU::sub1);
1179  (void)FullBase;
1180  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1181 
1182  return FullDestReg;
1183 }
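// The code above emits a sequence of the following shape (virtual register
// names are illustrative only):
//   %lo:vgpr_32, %carry:sreg_64_xexec = V_ADD_I32_e64 %Base.LoReg, OffsetLo, 0
//   %hi:vgpr_32, %dead:sreg_64_xexec = V_ADDC_U32_e64 %Base.HiReg, OffsetHi, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// which mirrors the pattern that processBaseWithConstOffset expects to match.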
1184 
1185 // Update base and offset with the NewBase and NewOffset in MI.
1186 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1187  unsigned NewBase,
1188  int32_t NewOffset) {
1189  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1190  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1191 }
1192 
1193 Optional<int32_t>
1194 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1195  if (Op.isImm())
1196  return Op.getImm();
1197 
1198  if (!Op.isReg())
1199  return None;
1200 
1201  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1202  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1203  !Def->getOperand(1).isImm())
1204  return None;
1205 
1206  return Def->getOperand(1).getImm();
1207 }
1208 
1209 // Analyze Base and extract:
1210 // - 32bit base registers, subregisters
1211 // - 64bit constant offset
1212 // Expecting base computation as:
1213 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1214 // %LO:vgpr_32, %c:sreg_64_xexec =
1215 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1216 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1217 // %Base:vreg_64 =
1218 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1219 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1220  MemAddress &Addr) {
1221  if (!Base.isReg())
1222  return;
1223 
1224  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1225  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1226  || Def->getNumOperands() != 5)
1227  return;
1228 
1229  MachineOperand BaseLo = Def->getOperand(1);
1230  MachineOperand BaseHi = Def->getOperand(3);
1231  if (!BaseLo.isReg() || !BaseHi.isReg())
1232  return;
1233 
1234  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1235  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1236 
1237  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1238  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1239  return;
1240 
1241  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1242  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1243 
1244  auto Offset0P = extractConstOffset(*Src0);
1245  if (Offset0P)
1246  BaseLo = *Src1;
1247  else {
1248  if (!(Offset0P = extractConstOffset(*Src1)))
1249  return;
1250  BaseLo = *Src0;
1251  }
1252 
1253  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1254  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1255 
1256  if (Src0->isImm())
1257  std::swap(Src0, Src1);
1258 
1259  if (!Src1->isImm())
1260  return;
1261 
1262  uint64_t Offset1 = Src1->getImm();
1263  BaseHi = *Src0;
1264 
1265  Addr.Base.LoReg = BaseLo.getReg();
1266  Addr.Base.HiReg = BaseHi.getReg();
1267  Addr.Base.LoSubReg = BaseLo.getSubReg();
1268  Addr.Base.HiSubReg = BaseHi.getSubReg();
1269  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1270 }
1271 
1272 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1273  MachineInstr &MI,
1274  MemInfoMap &Visited,
1275  SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1276 
1277  // TODO: Support flat and scratch.
1278  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1279  TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1280  return false;
1281 
1282  // TODO: Support Store.
1283  if (!MI.mayLoad())
1284  return false;
1285 
1286  if (AnchorList.count(&MI))
1287  return false;
1288 
1289  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1290 
1291  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1292  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1293  return false;
1294  }
1295 
1296  // Step1: Find the base-registers and a 64bit constant offset.
1297  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1298  MemAddress MAddr;
1299  if (Visited.find(&MI) == Visited.end()) {
1300  processBaseWithConstOffset(Base, MAddr);
1301  Visited[&MI] = MAddr;
1302  } else
1303  MAddr = Visited[&MI];
1304 
1305  if (MAddr.Offset == 0) {
1306  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1307  " constant offsets that can be promoted.\n";);
1308  return false;
1309  }
1310 
1311  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1312  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1313 
1314 // Step2: Traverse through MI's basic block and find an anchor (one with the
1315 // same base registers) at the largest 13-bit-reachable distance from MI's offset.
1316  // E.g. (64bit loads)
1317  // bb:
1318  // addr1 = &a + 4096; load1 = load(addr1, 0)
1319  // addr2 = &a + 6144; load2 = load(addr2, 0)
1320  // addr3 = &a + 8192; load3 = load(addr3, 0)
1321  // addr4 = &a + 10240; load4 = load(addr4, 0)
1322  // addr5 = &a + 12288; load5 = load(addr5, 0)
1323  //
1324  // Starting from the first load, the optimization will try to find a new base
1325 // from which (&a + 4096) is reachable with a 13-bit offset. Both &a + 6144 and
1326 // &a + 8192 are within 13-bit reach of &a + 4096. The heuristic picks &a + 8192
1327 // as the new base (anchor) because of the maximum distance, which can presumably
1328 // accommodate more intermediate bases.
1329  //
1330  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1331  // (&a + 8192) for load1, load2, load4.
1332  // addr = &a + 8192
1333  // load1 = load(addr, -4096)
1334  // load2 = load(addr, -2048)
1335  // load3 = load(addr, 0)
1336  // load4 = load(addr, 2048)
1337  // addr5 = &a + 12288; load5 = load(addr5, 0)
1338  //
1339  MachineInstr *AnchorInst = nullptr;
1340  MemAddress AnchorAddr;
1341  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1342  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1343 
1344  MachineBasicBlock *MBB = MI.getParent();
1345  MachineBasicBlock::iterator E = MBB->end();
1346  MachineBasicBlock::iterator MBBI = MI.getIterator();
1347  ++MBBI;
1348  const SITargetLowering *TLI =
1349  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1350 
1351  for ( ; MBBI != E; ++MBBI) {
1352  MachineInstr &MINext = *MBBI;
1353 // TODO: Support finding an anchor (with the same base) from store addresses or
1354  // any other load addresses where the opcodes are different.
1355  if (MINext.getOpcode() != MI.getOpcode() ||
1356  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1357  continue;
1358 
1359  const MachineOperand &BaseNext =
1360  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1361  MemAddress MAddrNext;
1362  if (Visited.find(&MINext) == Visited.end()) {
1363  processBaseWithConstOffset(BaseNext, MAddrNext);
1364  Visited[&MINext] = MAddrNext;
1365  } else
1366  MAddrNext = Visited[&MINext];
1367 
1368  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1369  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1370  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1371  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1372  continue;
1373 
1374  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1375 
1376  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1377  TargetLoweringBase::AddrMode AM;
1378  AM.HasBaseReg = true;
1379  AM.BaseOffs = Dist;
1380  if (TLI->isLegalGlobalAddressingMode(AM) &&
1381  (uint32_t)std::abs(Dist) > MaxDist) {
1382  MaxDist = std::abs(Dist);
1383 
1384  AnchorAddr = MAddrNext;
1385  AnchorInst = &MINext;
1386  }
1387  }
1388 
1389  if (AnchorInst) {
1390  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1391  AnchorInst->dump());
1392  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1393  << AnchorAddr.Offset << "\n\n");
1394 
1395 // Instead of moving up, just re-compute the anchor instruction's base address.
1396  unsigned Base = computeBase(MI, AnchorAddr);
1397 
1398  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1399  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1400 
1401  for (auto P : InstsWCommonBase) {
1402  TargetLoweringBase::AddrMode AM;
1403  AM.HasBaseReg = true;
1404  AM.BaseOffs = P.second - AnchorAddr.Offset;
1405 
1406  if (TLI->isLegalGlobalAddressingMode(AM)) {
1407  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1408  dbgs() << ")"; P.first->dump());
1409  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1410  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1411  }
1412  }
1413  AnchorList.insert(AnchorInst);
1414  return true;
1415  }
1416 
1417  return false;
1418 }
1419 
1420 // Scan through looking for adjacent LDS operations with constant offsets from
1421 // the same base register. We rely on the scheduler to do the hard work of
1422 // clustering nearby loads, and assume these are all adjacent.
1423 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1424  bool Modified = false;
1425 
1426 // Caches the base registers and constant offset computed for each visited instruction.
1427  MemInfoMap Visited;
1428  // Contains the list of instructions for which constant offsets are being
1429  // promoted to the IMM.
1430  SmallPtrSet<MachineInstr *, 4> AnchorList;
1431 
1432  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1433  MachineInstr &MI = *I;
1434 
1435  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1436  Modified = true;
1437 
1438  // Don't combine if volatile.
1439  if (MI.hasOrderedMemoryRef()) {
1440  ++I;
1441  continue;
1442  }
1443 
1444  const unsigned Opc = MI.getOpcode();
1445 
1446  CombineInfo CI;
1447  CI.I = I;
1448  CI.InstClass = getInstClass(Opc);
1449 
1450  switch (CI.InstClass) {
1451  default:
1452  break;
1453  case DS_READ:
1454  CI.EltSize =
1455  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1456  : 4;
1457  if (findMatchingInst(CI)) {
1458  Modified = true;
1459  I = mergeRead2Pair(CI);
1460  } else {
1461  ++I;
1462  }
1463  continue;
1464  case DS_WRITE:
1465  CI.EltSize =
1466  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1467  : 4;
1468  if (findMatchingInst(CI)) {
1469  Modified = true;
1470  I = mergeWrite2Pair(CI);
1471  } else {
1472  ++I;
1473  }
1474  continue;
1475  case S_BUFFER_LOAD_IMM:
1476  CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1477  if (findMatchingInst(CI)) {
1478  Modified = true;
1479  I = mergeSBufferLoadImmPair(CI);
1480  OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1481  } else {
1482  ++I;
1483  }
1484  continue;
1485  case BUFFER_LOAD_OFFEN:
1486  case BUFFER_LOAD_OFFSET:
1487  case BUFFER_LOAD_OFFEN_exact:
1488  case BUFFER_LOAD_OFFSET_exact:
1489  CI.EltSize = 4;
1490  if (findMatchingInst(CI)) {
1491  Modified = true;
1492  I = mergeBufferLoadPair(CI);
1493  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1494  } else {
1495  ++I;
1496  }
1497  continue;
1498  case BUFFER_STORE_OFFEN:
1499  case BUFFER_STORE_OFFSET:
1500  case BUFFER_STORE_OFFEN_exact:
1501  case BUFFER_STORE_OFFSET_exact:
1502  CI.EltSize = 4;
1503  if (findMatchingInst(CI)) {
1504  Modified = true;
1505  I = mergeBufferStorePair(CI);
1506  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1507  } else {
1508  ++I;
1509  }
1510  continue;
1511  }
1512 
1513  ++I;
1514  }
1515 
1516  return Modified;
1517 }
1518 
1519 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1520  if (skipFunction(MF.getFunction()))
1521  return false;
1522 
1523  STM = &MF.getSubtarget<GCNSubtarget>();
1524  if (!STM->loadStoreOptEnabled())
1525  return false;
1526 
1527  TII = STM->getInstrInfo();
1528  TRI = &TII->getRegisterInfo();
1529 
1530  MRI = &MF.getRegInfo();
1531  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1532 
1533  assert(MRI->isSSA() && "Must be run on SSA");
1534 
1535  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1536 
1537  bool Modified = false;
1538 
1539  for (MachineBasicBlock &MBB : MF) {
1540  do {
1541  OptimizeAgain = false;
1542  Modified |= optimizeBlock(MBB);
1543  } while (OptimizeAgain);
1544  }
1545 
1546  return Modified;
1547 }