//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const SISubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &RegDefs,
                      DenseSet<unsigned> &PhysRegUses,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
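  // As an illustrative example (not taken from a specific test case): two
  // ds_read_b32 at byte offsets 0 and 8192 have element offsets 0 and 2048,
  // which do not fit in 8 bits; both are multiples of 64, though, so they can
  // be encoded as offsets 0 and 32 of a ds_read2st64_b32.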
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
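  // Illustrative example: byte offsets 4096 and 4224 with EltSize == 4 give
  // element offsets 1024 and 1056, which do not fit in 8 bits, but their
  // difference does. Taking BaseOff = 4096 (added onto the base register by
  // the merge functions below) reduces the encoded offsets to 0 and 32.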
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}


unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags) // addr
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}


unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add() with these operands, and not .addReg(). We want to
  // preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags) // addr
      .add(*Data0)                   // data0
      .add(*Data1)                   // data1
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}


MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}


MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {

      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize
        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
      // EltSize is in units of the offset encoding.
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<SISubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    CreatedX2 = 0;
    Modified |= optimizeBlock(MBB);

    // Run again to convert x2 to x4.
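    // For example (illustrative, not from a specific test): four clustered
    // buffer_store_dword at offsets 0/4/8/12 are first merged into two
    // buffer_store_dwordx2; the second pass can then merge those into a
    // single buffer_store_dwordx4.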
    if (CreatedX2 >= 1)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}