//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close-by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together that the differences do fit, we can
//   add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const SISubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

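// Move all instructions in InstsToMove to just after I, preserving their
// original relative order.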
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
  // XXX: Should this be looking for implicit defs?
  for (const MachineOperand &Def : MI.defs())
    Defs.insert(Def.getReg());
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.

    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

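// Check whether the two offsets recorded in CI can be encoded by a single
// merged instruction. For DS instructions the offsets are rewritten in place
// into the units the merged encoding expects (element-sized slots, optionally
// using the ST64 stride or a new base offset in CI.BaseOff).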
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

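// Scan forward from CI.I for an instruction with the same opcode and base
// address that can be merged with it. Instructions that must be moved below
// the merge point are collected in CI.InstsToMove; on success CI.Paired and
// the two offsets (plus GLC/SLC bits where relevant) are filled in.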
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> DefsToMove;
  addDefsToList(*CI.I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur with
      // vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction. Check if we can move I across MBBI and if
    // we can move all of I's users as well.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

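// Replace the two DS reads described by CI with a single ds_read2 (or
// ds_read2st64), then copy each half of the wide result back into the
// original destination registers.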
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
                                   : AMDGPU::DS_READ2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
                            : AMDGPU::DS_READ2ST64_B64;

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
      .addImm(CI.BaseOff)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags) // addr
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

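// Replace the two DS writes described by CI with a single ds_write2 (or
// ds_write2st64) that stores both data operands.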
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add() with the original operands, and not .addReg(), so
  // that we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
                                   : AMDGPU::DS_WRITE2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                            : AMDGPU::DS_WRITE2ST64_B64;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = Addr->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
      .addImm(CI.BaseOff)
      .addReg(Addr->getReg());
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags) // addr
      .add(*Data0)                   // data0
      .add(*Data1)                   // data1
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

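// Replace a pair of s_buffer_load_dword (or dwordx2) instructions with the
// next wider variant, then copy the subregisters of the new result back into
// the original destinations.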
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

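// Replace a pair of buffer_load_dword (or dwordx2) instructions, in either
// the OFFEN or OFFSET form, with the next wider variant and copy the halves
// of the result back to the original destinations.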
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

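// Map a buffer-store opcode to its double-width form. IsX2 reports whether
// the input is already an x2 variant and IsOffen whether it uses the OFFEN
// addressing mode. Returns 0 for opcodes this pass does not handle.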
unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
    const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}

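// Combine two buffer stores into a single wider store: build the wide source
// with a REG_SEQUENCE of the two data operands and emit the promoted opcode
// at the location of the second store.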
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
      // EltSize is in units of the offset encoding.
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

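// Pass entry point: set up subtarget, instruction, and alias-analysis info,
// then optimize each block. A second sweep runs if the first one created any
// x2 instructions, so they can in turn be merged into x4 forms.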
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<SISubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;
  CreatedX2 = 0;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  // Run again to convert x2 to x4.
  if (CreatedX2 >= 1) {
    for (MachineBasicBlock &MBB : MF)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}