SILoadStoreOptimizer.cpp (LLVM 8.0.0svn)
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass tries to fuse DS instructions with close by immediate offsets.
11 // This will fuse operations such as
12 // ds_read_b32 v0, v2 offset:16
13 // ds_read_b32 v1, v2 offset:32
14 // ==>
15 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
16 //
17 // The same is done for certain SMEM and VMEM opcodes, e.g.:
18 // s_buffer_load_dword s4, s[0:3], 4
19 // s_buffer_load_dword s5, s[0:3], 8
20 // ==>
21 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
22 //
23 //
24 // Future improvements:
25 //
26 // - This currently relies on the scheduler to place loads and stores next to
27 // each other, and then only merges adjacent pairs of instructions. It would
28 // be good to be more flexible with interleaved instructions, and possibly run
29 // before scheduling. It currently missing stores of constants because loading
30 // the constant into the data register is placed between the stores, although
31 // this is arguably a scheduling problem.
32 //
33 // - Live interval recomputing seems inefficient. This currently only matches
34 // one pair, and recomputes live intervals and moves on to the next pair. It
35 // would be better to compute a list of all merges that need to occur.
36 //
37 // - With a list of instructions to process, we can also merge more. If a
38 // cluster of loads has offsets that are too large to fit in the 8-bit
39 // offset fields, but close enough to each other that their differences fit,
40 // we can add the common offset to the base pointer and use the new, reduced offsets.
41 //
42 //===----------------------------------------------------------------------===//
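As a quick illustration of the offset encoding discussed in the header above: the offset0/offset1 fields of ds_read2/ds_write2 are 8-bit element counts (4 bytes per element for _b32, 8 for _b64), which is why offset:16 and offset:32 become offset0:4 and offset1:8. Below is a small standalone sketch of that conversion; the helper name is invented for illustration and is not part of this pass.

#include <cassert>
#include <cstdio>

// Convert two byte offsets into ds_read2-style element offsets.
// EltSize is 4 for ds_read2_b32 and 8 for ds_read2_b64.
static bool toRead2Offsets(unsigned Byte0, unsigned Byte1, unsigned EltSize,
                           unsigned &Off0, unsigned &Off1) {
  if (Byte0 % EltSize != 0 || Byte1 % EltSize != 0)
    return false;                       // offsets must be element-aligned
  Off0 = Byte0 / EltSize;
  Off1 = Byte1 / EltSize;
  // Both fields are 8 bits wide and must differ for a valid pair.
  return Off0 <= 255 && Off1 <= 255 && Off0 != Off1;
}

int main() {
  unsigned Off0, Off1;
  // Mirrors the header example: offset:16 / offset:32 -> offset0:4 offset1:8.
  bool OK = toRead2Offsets(16, 32, /*EltSize=*/4, Off0, Off1);
  assert(OK && Off0 == 4 && Off1 == 8);
  std::printf("offset0:%u offset1:%u\n", Off0, Off1);
  return 0;
}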
43 
44 #include "AMDGPU.h"
45 #include "AMDGPUSubtarget.h"
46 #include "SIInstrInfo.h"
47 #include "SIRegisterInfo.h"
49 #include "Utils/AMDGPUBaseInfo.h"
50 #include "llvm/ADT/ArrayRef.h"
51 #include "llvm/ADT/SmallVector.h"
52 #include "llvm/ADT/StringRef.h"
53 #include "llvm/Analysis/AliasAnalysis.h"
54 #include "llvm/CodeGen/MachineBasicBlock.h"
55 #include "llvm/CodeGen/MachineFunction.h"
56 #include "llvm/CodeGen/MachineFunctionPass.h"
57 #include "llvm/CodeGen/MachineInstr.h"
58 #include "llvm/CodeGen/MachineInstrBuilder.h"
59 #include "llvm/CodeGen/MachineOperand.h"
60 #include "llvm/CodeGen/MachineRegisterInfo.h"
61 #include "llvm/IR/DebugLoc.h"
62 #include "llvm/Pass.h"
63 #include "llvm/Support/Debug.h"
64 #include "llvm/Support/MathExtras.h"
65 #include "llvm/Support/raw_ostream.h"
66 #include <algorithm>
67 #include <cassert>
68 #include <cstdlib>
69 #include <iterator>
70 #include <utility>
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "si-load-store-opt"
75 
76 namespace {
77 
78 class SILoadStoreOptimizer : public MachineFunctionPass {
79  enum InstClassEnum {
80  DS_READ_WRITE,
81  S_BUFFER_LOAD_IMM,
82  BUFFER_LOAD_OFFEN,
83  BUFFER_LOAD_OFFSET,
84  BUFFER_STORE_OFFEN,
85  BUFFER_STORE_OFFSET,
86  };
87 
88  struct CombineInfo {
89  MachineBasicBlock::iterator I;
90  MachineBasicBlock::iterator Paired;
91  unsigned EltSize;
92  unsigned Offset0;
93  unsigned Offset1;
94  unsigned BaseOff;
95  InstClassEnum InstClass;
96  bool GLC0;
97  bool GLC1;
98  bool SLC0;
99  bool SLC1;
100  bool UseST64;
101  bool IsX2;
102  SmallVector<MachineInstr*, 8> InstsToMove;
103  };
104 
105 private:
106  const GCNSubtarget *STM = nullptr;
107  const SIInstrInfo *TII = nullptr;
108  const SIRegisterInfo *TRI = nullptr;
109  MachineRegisterInfo *MRI = nullptr;
110  AliasAnalysis *AA = nullptr;
111  unsigned CreatedX2;
112 
113  static bool offsetsCanBeCombined(CombineInfo &CI);
114 
115  bool findMatchingInst(CombineInfo &CI);
116 
117  unsigned read2Opcode(unsigned EltSize) const;
118  unsigned read2ST64Opcode(unsigned EltSize) const;
119  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
120 
121  unsigned write2Opcode(unsigned EltSize) const;
122  unsigned write2ST64Opcode(unsigned EltSize) const;
123  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
124  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
125  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
126  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
127  bool &IsOffen) const;
128  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
129 
130 public:
131  static char ID;
132 
133  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
134  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
135  }
136 
137  bool optimizeBlock(MachineBasicBlock &MBB);
138 
139  bool runOnMachineFunction(MachineFunction &MF) override;
140 
141  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
142 
143  void getAnalysisUsage(AnalysisUsage &AU) const override {
144  AU.setPreservesCFG();
145  AU.addRequired<AAResultsWrapperPass>();
146 
147  MachineFunctionPass::getAnalysisUsage(AU);
148  }
149 };
150 
151 } // end anonymous namespace.
152 
153 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
154  "SI Load Store Optimizer", false, false)
155 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
156 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
157  "SI Load Store Optimizer", false, false)
158 
159 char SILoadStoreOptimizer::ID = 0;
160 
161 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
162 
163 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
164  return new SILoadStoreOptimizer();
165 }
166 
167 static void moveInstsAfter(MachineBasicBlock::iterator I,
168  ArrayRef<MachineInstr*> InstsToMove) {
169  MachineBasicBlock *MBB = I->getParent();
170  ++I;
171  for (MachineInstr *MI : InstsToMove) {
172  MI->removeFromParent();
173  MBB->insert(I, MI);
174  }
175 }
176 
177 static void addDefsUsesToList(const MachineInstr &MI,
178  DenseSet<unsigned> &RegDefs,
179  DenseSet<unsigned> &PhysRegUses) {
180  for (const MachineOperand &Op : MI.operands()) {
181  if (Op.isReg()) {
182  if (Op.isDef())
183  RegDefs.insert(Op.getReg());
184  else if (Op.readsReg() &&
185  TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
186  PhysRegUses.insert(Op.getReg());
187  }
188  }
189 }
190 
191 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
192  MachineBasicBlock::iterator B,
193  const SIInstrInfo *TII,
194  AliasAnalysis * AA) {
195  // RAW or WAR - cannot reorder
196  // WAW - cannot reorder
197  // RAR - safe to reorder
198  return !(A->mayStore() || B->mayStore()) ||
199  TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
200 }
201 
202 // Add MI and its defs to the lists if MI reads one of the defs that are
203 // already in the list. Returns true in that case.
204 static bool
205 addToListsIfDependent(MachineInstr &MI,
206  DenseSet<unsigned> &RegDefs,
207  DenseSet<unsigned> &PhysRegUses,
208  SmallVectorImpl<MachineInstr*> &Insts) {
209  for (MachineOperand &Use : MI.operands()) {
210  // If one of the defs is read, then there is a use of Def between I and the
211  // instruction that I will potentially be merged with. We will need to move
212  // this instruction after the merged instructions.
213  //
214  // Similarly, if there is a def which is read by an instruction that is to
215  // be moved for merging, then we need to move the def-instruction as well.
216  // This can only happen for physical registers such as M0; virtual
217  // registers are in SSA form.
218  if (Use.isReg() &&
219  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
220  (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
221  PhysRegUses.count(Use.getReg())))) {
222  Insts.push_back(&MI);
223  addDefsUsesToList(MI, RegDefs, PhysRegUses);
224  return true;
225  }
226  }
227 
228  return false;
229 }
230 
231 static bool
232 canMoveInstsAcrossMemOp(MachineInstr &MemOp,
233  ArrayRef<MachineInstr*> InstsToMove,
234  const SIInstrInfo *TII,
235  AliasAnalysis *AA) {
236  assert(MemOp.mayLoadOrStore());
237 
238  for (MachineInstr *InstToMove : InstsToMove) {
239  if (!InstToMove->mayLoadOrStore())
240  continue;
241  if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
242  return false;
243  }
244  return true;
245 }
246 
247 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
248  // XXX - Would the same offset be OK? Is there any reason this would happen or
249  // be useful?
250  if (CI.Offset0 == CI.Offset1)
251  return false;
252 
253  // This won't be valid if the offset isn't aligned.
254  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
255  return false;
256 
257  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
258  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
259  CI.UseST64 = false;
260  CI.BaseOff = 0;
261 
262  // Handle SMEM and VMEM instructions.
263  if (CI.InstClass != DS_READ_WRITE) {
264  unsigned Diff = CI.IsX2 ? 2 : 1;
265  return (EltOffset0 + Diff == EltOffset1 ||
266  EltOffset1 + Diff == EltOffset0) &&
267  CI.GLC0 == CI.GLC1 &&
268  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
269  }
270 
271 // If the offset in elements doesn't fit in 8 bits, we might be able to use
272  // the stride 64 versions.
273  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
274  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
275  CI.Offset0 = EltOffset0 / 64;
276  CI.Offset1 = EltOffset1 / 64;
277  CI.UseST64 = true;
278  return true;
279  }
280 
281  // Check if the new offsets fit in the reduced 8-bit range.
282  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
283  CI.Offset0 = EltOffset0;
284  CI.Offset1 = EltOffset1;
285  return true;
286  }
287 
288  // Try to shift base address to decrease offsets.
289  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
290  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
291 
292  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
293  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
294  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
295  CI.UseST64 = true;
296  return true;
297  }
298 
299  if (isUInt<8>(OffsetDiff)) {
300  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
301  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
302  return true;
303  }
304 
305  return false;
306 }
307 
308 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
309  MachineBasicBlock *MBB = CI.I->getParent();
310  MachineBasicBlock::iterator E = MBB->end();
311  MachineBasicBlock::iterator MBBI = CI.I;
312 
313  unsigned AddrOpName[3] = {0};
314  int AddrIdx[3];
315  const MachineOperand *AddrReg[3];
316  unsigned NumAddresses = 0;
317 
318  switch (CI.InstClass) {
319  case DS_READ_WRITE:
320  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
321  break;
322  case S_BUFFER_LOAD_IMM:
323  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
324  break;
325  case BUFFER_LOAD_OFFEN:
326  case BUFFER_STORE_OFFEN:
327  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
328  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
329  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
330  break;
331  case BUFFER_LOAD_OFFSET:
332  case BUFFER_STORE_OFFSET:
333  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
334  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
335  break;
336  }
337 
338  for (unsigned i = 0; i < NumAddresses; i++) {
339  AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
340  AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
341 
342  // We only ever merge operations with the same base address register, so don't
343  // bother scanning forward if there are no other uses.
344  if (AddrReg[i]->isReg() &&
345  (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
346  MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
347  return false;
348  }
349 
350  ++MBBI;
351 
352  DenseSet<unsigned> RegDefsToMove;
353  DenseSet<unsigned> PhysRegUsesToMove;
354  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
355 
356  for ( ; MBBI != E; ++MBBI) {
357  if (MBBI->getOpcode() != CI.I->getOpcode()) {
358  // This is not a matching DS instruction, but we can keep looking as
359 // long as one of these conditions is met:
360  // 1. It is safe to move I down past MBBI.
361  // 2. It is safe to move MBBI down past the instruction that I will
362  // be merged into.
363 
364  if (MBBI->hasUnmodeledSideEffects()) {
365  // We can't re-order this instruction with respect to other memory
366  // operations, so we fail both conditions mentioned above.
367  return false;
368  }
369 
370  if (MBBI->mayLoadOrStore() &&
371  (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
372  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
373  // We fail condition #1, but we may still be able to satisfy condition
374  // #2. Add this instruction to the move list and then we will check
375  // if condition #2 holds once we have selected the matching instruction.
376  CI.InstsToMove.push_back(&*MBBI);
377  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
378  continue;
379  }
380 
381 // When we match I with another DS instruction we will be moving I down
382 // to the location of the matched instruction, so any uses of I will need
383 // to be moved down as well.
384  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
385  CI.InstsToMove);
386  continue;
387  }
388 
389  // Don't merge volatiles.
390  if (MBBI->hasOrderedMemoryRef())
391  return false;
392 
393  // Handle a case like
394  // DS_WRITE_B32 addr, v, idx0
395  // w = DS_READ_B32 addr, idx0
396  // DS_WRITE_B32 addr, f(w), idx1
397  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
398  // merging of the two writes.
399  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
400  CI.InstsToMove))
401  continue;
402 
403  bool Match = true;
404  for (unsigned i = 0; i < NumAddresses; i++) {
405  const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
406 
407  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
408  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
409  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
410  Match = false;
411  break;
412  }
413  continue;
414  }
415 
416  // Check same base pointer. Be careful of subregisters, which can occur with
417  // vectors of pointers.
418  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
419  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
420  Match = false;
421  break;
422  }
423  }
424 
425  if (Match) {
426  int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
427  AMDGPU::OpName::offset);
428  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
429  CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
430  CI.Paired = MBBI;
431 
432  if (CI.InstClass == DS_READ_WRITE) {
433  CI.Offset0 &= 0xffff;
434  CI.Offset1 &= 0xffff;
435  } else {
436  CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
437  CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
438  if (CI.InstClass != S_BUFFER_LOAD_IMM) {
439  CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
440  CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
441  }
442  }
443 
444  // Check both offsets fit in the reduced range.
445  // We also need to go through the list of instructions that we plan to
446  // move and make sure they are all safe to move down past the merged
447  // instruction.
448  if (offsetsCanBeCombined(CI))
449  if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
450  return true;
451  }
452 
453  // We've found a load/store that we couldn't merge for some reason.
454  // We could potentially keep looking, but we'd need to make sure that
455 // it was safe to move I and also all the instructions in InstsToMove
456 // down past this instruction.
457 // Check if we can move I across MBBI and if we can move all of I's users.
458  if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
459  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
460  break;
461  }
462  return false;
463 }
464 
465 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
466  if (STM->ldsRequiresM0Init())
467  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
468  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
469 }
470 
471 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
472  if (STM->ldsRequiresM0Init())
473  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
474 
475  return (EltSize == 4) ?
476  AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
477 }
478 
479 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
480  CombineInfo &CI) {
481  MachineBasicBlock *MBB = CI.I->getParent();
482 
483  // Be careful, since the addresses could be subregisters themselves in weird
484  // cases, like vectors of pointers.
485  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
486 
487  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
488  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
489 
490  unsigned NewOffset0 = CI.Offset0;
491  unsigned NewOffset1 = CI.Offset1;
492  unsigned Opc = CI.UseST64 ?
493  read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
494 
495  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
496  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
497 
498  if (NewOffset0 > NewOffset1) {
499  // Canonicalize the merged instruction so the smaller offset comes first.
500  std::swap(NewOffset0, NewOffset1);
501  std::swap(SubRegIdx0, SubRegIdx1);
502  }
503 
504  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
505  (NewOffset0 != NewOffset1) &&
506  "Computed offset doesn't fit");
507 
508  const MCInstrDesc &Read2Desc = TII->get(Opc);
509 
510  const TargetRegisterClass *SuperRC
511  = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
512  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
513 
514  DebugLoc DL = CI.I->getDebugLoc();
515 
516  unsigned BaseReg = AddrReg->getReg();
517  unsigned BaseRegFlags = 0;
518  if (CI.BaseOff) {
519  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
520  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
521  .addImm(CI.BaseOff);
522 
523  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
524  BaseRegFlags = RegState::Kill;
525 
526  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
527  .addReg(ImmReg)
528  .addReg(AddrReg->getReg());
529  }
530 
531  MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
532  .addReg(BaseReg, BaseRegFlags) // addr
533  .addImm(NewOffset0) // offset0
534  .addImm(NewOffset1) // offset1
535  .addImm(0) // gds
536  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
537 
538  (void)Read2;
539 
540  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
541 
542  // Copy to the old destination registers.
543  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
544  .add(*Dest0) // Copy to same destination including flags and sub reg.
545  .addReg(DestReg, 0, SubRegIdx0);
546  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
547  .add(*Dest1)
548  .addReg(DestReg, RegState::Kill, SubRegIdx1);
549 
550  moveInstsAfter(Copy1, CI.InstsToMove);
551 
552  MachineBasicBlock::iterator Next = std::next(CI.I);
553  CI.I->eraseFromParent();
554  CI.Paired->eraseFromParent();
555 
556  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
557  return Next;
558 }
559 
560 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
561  if (STM->ldsRequiresM0Init())
562  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
563  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
564 }
565 
566 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
567  if (STM->ldsRequiresM0Init())
568  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
569 
570  return (EltSize == 4) ?
571  AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
572 }
573 
574 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
575  CombineInfo &CI) {
576  MachineBasicBlock *MBB = CI.I->getParent();
577 
578 // Be sure to use .add() with these operands, and not .addReg(). We want to be
579 // sure we preserve the subregister index and any register flags set on them.
580  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
581  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
582  const MachineOperand *Data1
583  = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
584 
585  unsigned NewOffset0 = CI.Offset0;
586  unsigned NewOffset1 = CI.Offset1;
587  unsigned Opc = CI.UseST64 ?
588  write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
589 
590  if (NewOffset0 > NewOffset1) {
591  // Canonicalize the merged instruction so the smaller offset comes first.
592  std::swap(NewOffset0, NewOffset1);
593  std::swap(Data0, Data1);
594  }
595 
596  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
597  (NewOffset0 != NewOffset1) &&
598  "Computed offset doesn't fit");
599 
600  const MCInstrDesc &Write2Desc = TII->get(Opc);
601  DebugLoc DL = CI.I->getDebugLoc();
602 
603  unsigned BaseReg = AddrReg->getReg();
604  unsigned BaseRegFlags = 0;
605  if (CI.BaseOff) {
606  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
607  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
608  .addImm(CI.BaseOff);
609 
610  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
611  BaseRegFlags = RegState::Kill;
612 
613  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
614  .addReg(ImmReg)
615  .addReg(AddrReg->getReg());
616  }
617 
618  MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
619  .addReg(BaseReg, BaseRegFlags) // addr
620  .add(*Data0) // data0
621  .add(*Data1) // data1
622  .addImm(NewOffset0) // offset0
623  .addImm(NewOffset1) // offset1
624  .addImm(0) // gds
625  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
626 
627  moveInstsAfter(Write2, CI.InstsToMove);
628 
629  MachineBasicBlock::iterator Next = std::next(CI.I);
630  CI.I->eraseFromParent();
631  CI.Paired->eraseFromParent();
632 
633  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
634  return Next;
635 }
636 
637 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
638  CombineInfo &CI) {
639  MachineBasicBlock *MBB = CI.I->getParent();
640  DebugLoc DL = CI.I->getDebugLoc();
641  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
642  AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
643 
644  const TargetRegisterClass *SuperRC =
645  CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
646  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
647  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
648 
649  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
650  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
651  .addImm(MergedOffset) // offset
652  .addImm(CI.GLC0) // glc
653  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
654 
655  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
656  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
657 
658  // Handle descending offsets
659  if (CI.Offset0 > CI.Offset1)
660  std::swap(SubRegIdx0, SubRegIdx1);
661 
662  // Copy to the old destination registers.
663  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
664  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
665  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
666 
667  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
668  .add(*Dest0) // Copy to same destination including flags and sub reg.
669  .addReg(DestReg, 0, SubRegIdx0);
670  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
671  .add(*Dest1)
672  .addReg(DestReg, RegState::Kill, SubRegIdx1);
673 
674  moveInstsAfter(Copy1, CI.InstsToMove);
675 
676  MachineBasicBlock::iterator Next = std::next(CI.I);
677  CI.I->eraseFromParent();
678  CI.Paired->eraseFromParent();
679  return Next;
680 }
681 
682 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
683  CombineInfo &CI) {
684  MachineBasicBlock *MBB = CI.I->getParent();
685  DebugLoc DL = CI.I->getDebugLoc();
686  unsigned Opcode;
687 
688  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
689  Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
690  AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
691  } else {
692  Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
693  AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
694  }
695 
696  const TargetRegisterClass *SuperRC =
697  CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
698  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
699  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
700 
701  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
702 
703  if (CI.InstClass == BUFFER_LOAD_OFFEN)
704  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
705 
706  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
707  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
708  .addImm(MergedOffset) // offset
709  .addImm(CI.GLC0) // glc
710  .addImm(CI.SLC0) // slc
711  .addImm(0) // tfe
712  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
713 
714  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
715  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
716 
717  // Handle descending offsets
718  if (CI.Offset0 > CI.Offset1)
719  std::swap(SubRegIdx0, SubRegIdx1);
720 
721  // Copy to the old destination registers.
722  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
723  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
724  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
725 
726  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
727  .add(*Dest0) // Copy to same destination including flags and sub reg.
728  .addReg(DestReg, 0, SubRegIdx0);
729  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
730  .add(*Dest1)
731  .addReg(DestReg, RegState::Kill, SubRegIdx1);
732 
733  moveInstsAfter(Copy1, CI.InstsToMove);
734 
735  MachineBasicBlock::iterator Next = std::next(CI.I);
736  CI.I->eraseFromParent();
737  CI.Paired->eraseFromParent();
738  return Next;
739 }
740 
741 unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
742  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
743  IsX2 = false;
744  IsOffen = false;
745 
746  switch (I.getOpcode()) {
747  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
748  IsOffen = true;
749  return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
750  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
751  IsOffen = true;
752  return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
753  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
754  IsX2 = true;
755  IsOffen = true;
756  return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
757  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
758  IsX2 = true;
759  IsOffen = true;
760  return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
761  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
762  return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
763  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
764  return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
765  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
766  IsX2 = true;
767  return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
768  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
769  IsX2 = true;
770  return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
771  }
772  return 0;
773 }
774 
775 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
776  CombineInfo &CI) {
777  MachineBasicBlock *MBB = CI.I->getParent();
778  DebugLoc DL = CI.I->getDebugLoc();
779  bool Unused1, Unused2;
780  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
781 
782  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
783  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
784 
785  // Handle descending offsets
786  if (CI.Offset0 > CI.Offset1)
787  std::swap(SubRegIdx0, SubRegIdx1);
788 
789  // Copy to the new source register.
790  const TargetRegisterClass *SuperRC =
791  CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
792  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
793 
794  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
795  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
796 
797  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
798  .add(*Src0)
799  .addImm(SubRegIdx0)
800  .add(*Src1)
801  .addImm(SubRegIdx1);
802 
803  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
804  .addReg(SrcReg, RegState::Kill);
805 
806  if (CI.InstClass == BUFFER_STORE_OFFEN)
807  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
808 
809  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
810  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
811  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
812  .addImm(CI.GLC0) // glc
813  .addImm(CI.SLC0) // slc
814  .addImm(0) // tfe
815  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
816 
817  moveInstsAfter(MIB, CI.InstsToMove);
818 
819  MachineBasicBlock::iterator Next = std::next(CI.I);
820  CI.I->eraseFromParent();
821  CI.Paired->eraseFromParent();
822  return Next;
823 }
824 
825 // Scan through looking for adjacent LDS operations with constant offsets from
826 // the same base register. We rely on the scheduler to do the hard work of
827 // clustering nearby loads, and assume these are all adjacent.
828 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
829  bool Modified = false;
830 
831  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
832  MachineInstr &MI = *I;
833 
834  // Don't combine if volatile.
835  if (MI.hasOrderedMemoryRef()) {
836  ++I;
837  continue;
838  }
839 
840  CombineInfo CI;
841  CI.I = I;
842  unsigned Opc = MI.getOpcode();
843  if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
844  Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
845 
846  CI.InstClass = DS_READ_WRITE;
847  CI.EltSize =
848  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
849 
850  if (findMatchingInst(CI)) {
851  Modified = true;
852  I = mergeRead2Pair(CI);
853  } else {
854  ++I;
855  }
856 
857  continue;
858  } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
859  Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
860  Opc == AMDGPU::DS_WRITE_B64_gfx9) {
861  CI.InstClass = DS_READ_WRITE;
862  CI.EltSize
863  = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
864 
865  if (findMatchingInst(CI)) {
866  Modified = true;
867  I = mergeWrite2Pair(CI);
868  } else {
869  ++I;
870  }
871 
872  continue;
873  }
874  if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
875  Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
876  // EltSize is in units of the offset encoding.
877  CI.InstClass = S_BUFFER_LOAD_IMM;
878  CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
879  CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
880  if (findMatchingInst(CI)) {
881  Modified = true;
882  I = mergeSBufferLoadImmPair(CI);
883  if (!CI.IsX2)
884  CreatedX2++;
885  } else {
886  ++I;
887  }
888  continue;
889  }
890  if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
891  Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
892  Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
893  Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
894  if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
895  Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
896  CI.InstClass = BUFFER_LOAD_OFFEN;
897  else
898  CI.InstClass = BUFFER_LOAD_OFFSET;
899 
900  CI.EltSize = 4;
901  CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
902  Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
903  if (findMatchingInst(CI)) {
904  Modified = true;
905  I = mergeBufferLoadPair(CI);
906  if (!CI.IsX2)
907  CreatedX2++;
908  } else {
909  ++I;
910  }
911  continue;
912  }
913 
914  bool StoreIsX2, IsOffen;
915  if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
916  CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
917  CI.EltSize = 4;
918  CI.IsX2 = StoreIsX2;
919  if (findMatchingInst(CI)) {
920  Modified = true;
921  I = mergeBufferStorePair(CI);
922  if (!CI.IsX2)
923  CreatedX2++;
924  } else {
925  ++I;
926  }
927  continue;
928  }
929 
930  ++I;
931  }
932 
933  return Modified;
934 }
935 
936 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
937  if (skipFunction(MF.getFunction()))
938  return false;
939 
940  STM = &MF.getSubtarget<GCNSubtarget>();
941  if (!STM->loadStoreOptEnabled())
942  return false;
943 
944  TII = STM->getInstrInfo();
945  TRI = &TII->getRegisterInfo();
946 
947  MRI = &MF.getRegInfo();
948  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
949 
950  assert(MRI->isSSA() && "Must be run on SSA");
951 
952  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
953 
954  bool Modified = false;
955 
956  for (MachineBasicBlock &MBB : MF) {
957  CreatedX2 = 0;
958  Modified |= optimizeBlock(MBB);
959 
960  // Run again to convert x2 to x4.
961  if (CreatedX2 >= 1)
962  Modified |= optimizeBlock(MBB);
963  }
964 
965  return Modified;
966 }
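For reference, this pass can be exercised on MIR in isolation; the pass name matches DEBUG_TYPE above. AMDGPU's lit tests typically use an invocation along these lines (treat this as a sketch: the exact flags depend on the build, and input.mir is just a placeholder file name):

llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - input.mir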