1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads have offsets that are too large to fit in the 8-bit
55 // offset fields, but are close enough to each other that their differences
56 // fit in 8 bits, we can add to the base pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73  UNKNOWN,
74  DS_READ,
75  DS_WRITE,
76  S_BUFFER_LOAD_IMM,
77  BUFFER_LOAD,
78  BUFFER_STORE,
79  MIMG,
80  TBUFFER_LOAD,
81  TBUFFER_STORE,
82 };
83 
84 struct AddressRegs {
85  unsigned char NumVAddrs = 0;
86  bool SBase = false;
87  bool SRsrc = false;
88  bool SOffset = false;
89  bool VAddr = false;
90  bool Addr = false;
91  bool SSamp = false;
92 };
93 
94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95 const unsigned MaxAddressRegs = 12 + 1 + 1;
96 
97 class SILoadStoreOptimizer : public MachineFunctionPass {
98  struct CombineInfo {
99  MachineBasicBlock::iterator I;
100  unsigned EltSize;
101  unsigned Offset;
102  unsigned Width;
103  unsigned Format;
104  unsigned BaseOff;
105  unsigned DMask;
106  InstClassEnum InstClass;
107  unsigned CPol = 0;
108  bool UseST64;
109  int AddrIdx[MaxAddressRegs];
110  const MachineOperand *AddrReg[MaxAddressRegs];
111  unsigned NumAddresses;
112  unsigned Order;
113 
114  bool hasSameBaseAddress(const MachineInstr &MI) {
115  for (unsigned i = 0; i < NumAddresses; i++) {
116  const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
117 
118  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
119  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
120  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
121  return false;
122  }
123  continue;
124  }
125 
126  // Check same base pointer. Be careful of subregisters, which can occur
127  // with vectors of pointers.
128  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
129  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
130  return false;
131  }
132  }
133  return true;
134  }
135 
136  bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
137  for (unsigned i = 0; i < NumAddresses; ++i) {
138  const MachineOperand *AddrOp = AddrReg[i];
139  // Immediates are always OK.
140  if (AddrOp->isImm())
141  continue;
142 
143  // Don't try to merge addresses that aren't either immediates or registers.
144  // TODO: Should be possible to merge FrameIndexes and maybe some other
145  // non-register operands.
146  if (!AddrOp->isReg())
147  return false;
148 
149  // TODO: We should be able to merge physical reg addresses.
150  if (AddrOp->getReg().isPhysical())
151  return false;
152 
153  // If an address has only one use then there will be no other
154  // instructions with the same address, so we can't merge this one.
155  if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
156  return false;
157  }
158  return true;
159  }
160 
161  void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
162  const GCNSubtarget &STM);
163  };
164 
165  struct BaseRegisters {
166  Register LoReg;
167  Register HiReg;
168 
169  unsigned LoSubReg = 0;
170  unsigned HiSubReg = 0;
171  };
172 
173  struct MemAddress {
174  BaseRegisters Base;
175  int64_t Offset = 0;
176  };
177 
178  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179 
180 private:
181  const GCNSubtarget *STM = nullptr;
182  const SIInstrInfo *TII = nullptr;
183  const SIRegisterInfo *TRI = nullptr;
184  MachineRegisterInfo *MRI = nullptr;
185  AliasAnalysis *AA = nullptr;
186  bool OptimizeAgain;
187 
188  static bool dmasksCanBeCombined(const CombineInfo &CI,
189  const SIInstrInfo &TII,
190  const CombineInfo &Paired);
191  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
192  CombineInfo &Paired, bool Modify = false);
193  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
194  const CombineInfo &Paired);
195  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
196  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
197  const CombineInfo &Paired);
198  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
199  const CombineInfo &Paired);
200  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
201 
202  bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
203  SmallVectorImpl<MachineInstr *> &InstsToMove);
204 
205  unsigned read2Opcode(unsigned EltSize) const;
206  unsigned read2ST64Opcode(unsigned EltSize) const;
207  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
208  CombineInfo &Paired,
209  const SmallVectorImpl<MachineInstr *> &InstsToMove);
210 
211  unsigned write2Opcode(unsigned EltSize) const;
212  unsigned write2ST64Opcode(unsigned EltSize) const;
213  MachineBasicBlock::iterator
214  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
215  const SmallVectorImpl<MachineInstr *> &InstsToMove);
216  MachineBasicBlock::iterator
217  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
218  const SmallVectorImpl<MachineInstr *> &InstsToMove);
219  MachineBasicBlock::iterator
220  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
221  const SmallVectorImpl<MachineInstr *> &InstsToMove);
222  MachineBasicBlock::iterator
223  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
224  const SmallVectorImpl<MachineInstr *> &InstsToMove);
225  MachineBasicBlock::iterator
226  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
227  const SmallVectorImpl<MachineInstr *> &InstsToMove);
228  MachineBasicBlock::iterator
229  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
230  const SmallVectorImpl<MachineInstr *> &InstsToMove);
231  MachineBasicBlock::iterator
232  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
233  const SmallVectorImpl<MachineInstr *> &InstsToMove);
234 
235  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
236  int32_t NewOffset) const;
237  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
238  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
239  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
240  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
241  /// Promotes constant offset to the immediate by adjusting the base. It
242  /// tries to use a base from the nearby instructions that allows it to have
243 /// a 13-bit constant offset which gets promoted to the immediate.
244  bool promoteConstantOffsetToImm(MachineInstr &CI,
245  MemInfoMap &Visited,
246  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
247  void addInstToMergeableList(const CombineInfo &CI,
248  std::list<std::list<CombineInfo> > &MergeableInsts) const;
249 
250  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
251  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
252  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
253  std::list<std::list<CombineInfo>> &MergeableInsts) const;
254 
255 public:
256  static char ID;
257 
258  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
259  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
260  }
261 
262  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
263  bool &OptimizeListAgain);
264  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
265 
266  bool runOnMachineFunction(MachineFunction &MF) override;
267 
268  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
269 
270  void getAnalysisUsage(AnalysisUsage &AU) const override {
271  AU.setPreservesCFG();
272  AU.addRequired<AAResultsWrapperPass>();
273 
274  MachineFunctionPass::getAnalysisUsage(AU);
275  }
276 
277  MachineFunctionProperties getRequiredProperties() const override {
278  return MachineFunctionProperties()
279  .set(MachineFunctionProperties::Property::IsSSA);
280  }
281 };
282 
283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
284  const unsigned Opc = MI.getOpcode();
285 
286  if (TII.isMUBUF(Opc)) {
287  // FIXME: Handle d16 correctly
288  return AMDGPU::getMUBUFElements(Opc);
289  }
290  if (TII.isMIMG(MI)) {
291  uint64_t DMaskImm =
292  TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
293  return countPopulation(DMaskImm);
294  }
295  if (TII.isMTBUF(Opc)) {
296  return AMDGPU::getMTBUFElements(Opc);
297  }
298 
299  switch (Opc) {
300  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
301  return 1;
302  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
303  return 2;
304  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
305  return 4;
306  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
307  return 8;
308  case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
309  case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
310  case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
311  case AMDGPU::DS_WRITE_B32_gfx9:
312  return 1;
313  case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
314  case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
315  case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
316  case AMDGPU::DS_WRITE_B64_gfx9:
317  return 2;
318  default:
319  return 0;
320  }
321 }
322 
323 /// Maps instruction opcode to enum InstClassEnum.
324 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
325  switch (Opc) {
326  default:
327  if (TII.isMUBUF(Opc)) {
328  switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
329  default:
330  return UNKNOWN;
331  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
332  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
333  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
334  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
335  return BUFFER_LOAD;
336  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
337  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
338  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
339  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
340  return BUFFER_STORE;
341  }
342  }
343  if (TII.isMIMG(Opc)) {
344  // Ignore instructions encoded without vaddr.
345  if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
346  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
347  return UNKNOWN;
348  // Ignore BVH instructions
349  if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
350  return UNKNOWN;
351  // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
352  if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
353  TII.isGather4(Opc))
354  return UNKNOWN;
355  return MIMG;
356  }
357  if (TII.isMTBUF(Opc)) {
358  switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
359  default:
360  return UNKNOWN;
361  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
362  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
363  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
364  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
365  return TBUFFER_LOAD;
366  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
367  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
368  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
369  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
370  return TBUFFER_STORE;
371  }
372  }
373  return UNKNOWN;
374  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
375  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
376  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
377  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
378  return S_BUFFER_LOAD_IMM;
379  case AMDGPU::DS_READ_B32:
380  case AMDGPU::DS_READ_B32_gfx9:
381  case AMDGPU::DS_READ_B64:
382  case AMDGPU::DS_READ_B64_gfx9:
383  return DS_READ;
384  case AMDGPU::DS_WRITE_B32:
385  case AMDGPU::DS_WRITE_B32_gfx9:
386  case AMDGPU::DS_WRITE_B64:
387  case AMDGPU::DS_WRITE_B64_gfx9:
388  return DS_WRITE;
389  }
390 }
391 
392 /// Determines instruction subclass from opcode. Only instructions
393 /// of the same subclass can be merged together.
394 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
395  switch (Opc) {
396  default:
397  if (TII.isMUBUF(Opc))
398  return AMDGPU::getMUBUFBaseOpcode(Opc);
399  if (TII.isMIMG(Opc)) {
400  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
401  assert(Info);
402  return Info->BaseOpcode;
403  }
404  if (TII.isMTBUF(Opc))
405  return AMDGPU::getMTBUFBaseOpcode(Opc);
406  return -1;
407  case AMDGPU::DS_READ_B32:
408  case AMDGPU::DS_READ_B32_gfx9:
409  case AMDGPU::DS_READ_B64:
410  case AMDGPU::DS_READ_B64_gfx9:
411  case AMDGPU::DS_WRITE_B32:
412  case AMDGPU::DS_WRITE_B32_gfx9:
413  case AMDGPU::DS_WRITE_B64:
414  case AMDGPU::DS_WRITE_B64_gfx9:
415  return Opc;
416  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
417  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
418  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
419  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
420  return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
421  }
422 }
423 
424 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
425  AddressRegs Result;
426 
427  if (TII.isMUBUF(Opc)) {
428  if (AMDGPU::getMUBUFHasVAddr(Opc))
429  Result.VAddr = true;
430  if (AMDGPU::getMUBUFHasSrsrc(Opc))
431  Result.SRsrc = true;
432  if (AMDGPU::getMUBUFHasSoffset(Opc))
433  Result.SOffset = true;
434 
435  return Result;
436  }
437 
438  if (TII.isMIMG(Opc)) {
439  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
440  if (VAddr0Idx >= 0) {
441  int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
442  Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
443  } else {
444  Result.VAddr = true;
445  }
446  Result.SRsrc = true;
447  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
448  if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
449  Result.SSamp = true;
450 
451  return Result;
452  }
453  if (TII.isMTBUF(Opc)) {
454  if (AMDGPU::getMTBUFHasVAddr(Opc))
455  Result.VAddr = true;
456  if (AMDGPU::getMTBUFHasSrsrc(Opc))
457  Result.SRsrc = true;
458  if (AMDGPU::getMTBUFHasSoffset(Opc))
459  Result.SOffset = true;
460 
461  return Result;
462  }
463 
464  switch (Opc) {
465  default:
466  return Result;
467  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
468  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
469  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
470  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
471  Result.SBase = true;
472  return Result;
473  case AMDGPU::DS_READ_B32:
474  case AMDGPU::DS_READ_B64:
475  case AMDGPU::DS_READ_B32_gfx9:
476  case AMDGPU::DS_READ_B64_gfx9:
477  case AMDGPU::DS_WRITE_B32:
478  case AMDGPU::DS_WRITE_B64:
479  case AMDGPU::DS_WRITE_B32_gfx9:
480  case AMDGPU::DS_WRITE_B64_gfx9:
481  Result.Addr = true;
482  return Result;
483  }
484 }
485 
486 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
487  const SIInstrInfo &TII,
488  const GCNSubtarget &STM) {
489  I = MI;
490  unsigned Opc = MI->getOpcode();
491  InstClass = getInstClass(Opc, TII);
492 
493  if (InstClass == UNKNOWN)
494  return;
495 
496  switch (InstClass) {
497  case DS_READ:
498  EltSize =
499  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
500  : 4;
501  break;
502  case DS_WRITE:
503  EltSize =
504  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
505  : 4;
506  break;
507  case S_BUFFER_LOAD_IMM:
508  EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
509  break;
510  default:
511  EltSize = 4;
512  break;
513  }
514 
515  if (InstClass == MIMG) {
516  DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
517  // Offset is not considered for MIMG instructions.
518  Offset = 0;
519  } else {
520  int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
521  Offset = I->getOperand(OffsetIdx).getImm();
522  }
523 
524  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
525  Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
526 
527  Width = getOpcodeWidth(*I, TII);
528 
529  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
530  Offset &= 0xffff;
531  } else if (InstClass != MIMG) {
532  CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
533  }
534 
535  AddressRegs Regs = getRegs(Opc, TII);
536 
537  NumAddresses = 0;
538  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
539  AddrIdx[NumAddresses++] =
540  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
541  if (Regs.Addr)
542  AddrIdx[NumAddresses++] =
543  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
544  if (Regs.SBase)
545  AddrIdx[NumAddresses++] =
546  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
547  if (Regs.SRsrc)
548  AddrIdx[NumAddresses++] =
549  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
550  if (Regs.SOffset)
551  AddrIdx[NumAddresses++] =
552  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
553  if (Regs.VAddr)
554  AddrIdx[NumAddresses++] =
555  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
556  if (Regs.SSamp)
557  AddrIdx[NumAddresses++] =
558  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
559  assert(NumAddresses <= MaxAddressRegs);
560 
561  for (unsigned J = 0; J < NumAddresses; J++)
562  AddrReg[J] = &I->getOperand(AddrIdx[J]);
563 }
564 
565 } // end anonymous namespace.
566 
567 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
568  "SI Load Store Optimizer", false, false)
569 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
570 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
571  false, false)
572 
573 char SILoadStoreOptimizer::ID = 0;
574 
575 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
576 
577 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
578  return new SILoadStoreOptimizer();
579 }
580 
581 static void moveInstsAfter(MachineBasicBlock::iterator I,
582  ArrayRef<MachineInstr *> InstsToMove) {
583  MachineBasicBlock *MBB = I->getParent();
584  ++I;
585  for (MachineInstr *MI : InstsToMove) {
586  MI->removeFromParent();
587  MBB->insert(I, MI);
588  }
589 }
590 
591 static void addDefsUsesToList(const MachineInstr &MI,
592  DenseSet<Register> &RegDefs,
593  DenseSet<Register> &PhysRegUses) {
594  for (const MachineOperand &Op : MI.operands()) {
595  if (Op.isReg()) {
596  if (Op.isDef())
597  RegDefs.insert(Op.getReg());
598  else if (Op.readsReg() && Op.getReg().isPhysical())
599  PhysRegUses.insert(Op.getReg());
600  }
601  }
602 }
603 
604 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
605  MachineBasicBlock::iterator B,
606  AliasAnalysis *AA) {
607  // RAW or WAR - cannot reorder
608  // WAW - cannot reorder
609  // RAR - safe to reorder
610  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
611 }
612 
613 // Add MI and its defs to the lists if MI reads one of the defs that are
614 // already in the list. Returns true in that case.
615 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
616  DenseSet<Register> &PhysRegUses,
617  SmallVectorImpl<MachineInstr *> &Insts) {
618  for (MachineOperand &Use : MI.operands()) {
619  // If one of the defs is read, then there is a use of Def between I and the
620  // instruction that I will potentially be merged with. We will need to move
621  // this instruction after the merged instructions.
622  //
623  // Similarly, if there is a def which is read by an instruction that is to
624  // be moved for merging, then we need to move the def-instruction as well.
625  // This can only happen for physical registers such as M0; virtual
626  // registers are in SSA form.
627  if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
628  (Use.isDef() && RegDefs.count(Use.getReg())) ||
629  (Use.isDef() && Use.getReg().isPhysical() &&
630  PhysRegUses.count(Use.getReg())))) {
631  Insts.push_back(&MI);
632  addDefsUsesToList(MI, RegDefs, PhysRegUses);
633  return true;
634  }
635  }
636 
637  return false;
638 }
639 
640 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
641  ArrayRef<MachineInstr *> InstsToMove,
642  AliasAnalysis *AA) {
643  assert(MemOp.mayLoadOrStore());
644 
645  for (MachineInstr *InstToMove : InstsToMove) {
646  if (!InstToMove->mayLoadOrStore())
647  continue;
648  if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
649  return false;
650  }
651  return true;
652 }
653 
654 // This function assumes that \p A and \p B are identical except for
655 // size and offset, and they reference adjacent memory.
656 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
657  const MachineMemOperand *A,
658  const MachineMemOperand *B) {
659  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
660  unsigned Size = A->getSize() + B->getSize();
661  // This function adds the offset parameter to the existing offset for A,
662  // so we pass 0 here as the offset and then manually set it to the correct
663  // value after the call.
664  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
665  MMO->setOffset(MinOffset);
666  return MMO;
667 }
668 
669 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
670  const SIInstrInfo &TII,
671  const CombineInfo &Paired) {
672  assert(CI.InstClass == MIMG);
673 
674  // Ignore instructions with tfe/lwe set.
675  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
676  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
677 
678  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
679  return false;
680 
681  // Check other optional immediate operands for equality.
682  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
683  AMDGPU::OpName::unorm, AMDGPU::OpName::da,
684  AMDGPU::OpName::r128, AMDGPU::OpName::a16};
685 
686  for (auto op : OperandsToMatch) {
687  int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
688  if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
689  return false;
690  if (Idx != -1 &&
691  CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
692  return false;
693  }
694 
695  // Check DMask for overlaps.
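 // Illustrative example (editor's note, not part of the original source): with
 // CI.DMask = 0b0011 and Paired.DMask = 0b1100 the masks are disjoint and the
 // smaller mask lies entirely below the lowest set bit of the larger one
 // (1 << countTrailingZeros(0b1100) == 4 > 0b0011), so the pair is accepted;
 // CI.DMask = 0b0011 with Paired.DMask = 0b0110 would be rejected because
 // 1 << countTrailingZeros(0b0110) == 2 <= 0b0011.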
696  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
697  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
698 
699  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
700  if ((1u << AllowedBitsForMin) <= MinMask)
701  return false;
702 
703  return true;
704 }
705 
706 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
707  unsigned ComponentCount,
708  const GCNSubtarget &STI) {
709  if (ComponentCount > 4)
710  return 0;
711 
712  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
713  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
714  if (!OldFormatInfo)
715  return 0;
716 
717  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
718  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
719  ComponentCount,
720  OldFormatInfo->NumFormat, STI);
721 
722  if (!NewFormatInfo)
723  return 0;
724 
725  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
726  NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
727 
728  return NewFormatInfo->Format;
729 }
730 
731 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
732 // highest power of two. Note that the result is well defined for all inputs
733 // including corner cases like:
734 // - if Lo == Hi, return that value
735 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
736 // - if Lo > Hi, return 0 (as if the range wrapped around)
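// Worked example (editor's note, not part of the original source): for Lo = 5
// and Hi = 11, (Lo - 1) ^ Hi = 4 ^ 11 = 0b1111, so the mask below clears the
// low three bits and the result is 11 & ~7 = 8 -- the only multiple of 8 in
// [5,11], and no multiple of 16 fits in that range.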
737 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
738  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
739 }
740 
741 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
742  const GCNSubtarget &STI,
743  CombineInfo &Paired,
744  bool Modify) {
745  assert(CI.InstClass != MIMG);
746 
747  // XXX - Would the same offset be OK? Is there any reason this would happen or
748  // be useful?
749  if (CI.Offset == Paired.Offset)
750  return false;
751 
752  // This won't be valid if the offset isn't aligned.
753  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
754  return false;
755 
756  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
757 
758  const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
759  llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
760  if (!Info0)
761  return false;
762  const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
763  llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
764  if (!Info1)
765  return false;
766 
767  if (Info0->BitsPerComp != Info1->BitsPerComp ||
768  Info0->NumFormat != Info1->NumFormat)
769  return false;
770 
771  // TODO: Should be possible to support more formats, but if format loads
772  // are not dword-aligned, the merged load might not be valid.
773  if (Info0->BitsPerComp != 32)
774  return false;
775 
776  if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
777  return false;
778  }
779 
780  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
781  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
782  CI.UseST64 = false;
783  CI.BaseOff = 0;
784 
785  // Handle all non-DS instructions.
786  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
787  return (EltOffset0 + CI.Width == EltOffset1 ||
788  EltOffset1 + Paired.Width == EltOffset0) &&
789  CI.CPol == Paired.CPol &&
790  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
791  }
792 
793  // If the offset in elements doesn't fit in 8-bits, we might be able to use
794  // the stride 64 versions.
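 // Illustrative example (editor's note, not part of the original source):
 // element offsets 640 and 6400 are both multiples of 64, and 640/64 = 10 and
 // 6400/64 = 100 both fit in 8 bits, so the pair can use the ST64 form with
 // offsets 10 and 100.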
795  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
796  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
797  if (Modify) {
798  CI.Offset = EltOffset0 / 64;
799  Paired.Offset = EltOffset1 / 64;
800  CI.UseST64 = true;
801  }
802  return true;
803  }
804 
805  // Check if the new offsets fit in the reduced 8-bit range.
806  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
807  if (Modify) {
808  CI.Offset = EltOffset0;
809  Paired.Offset = EltOffset1;
810  }
811  return true;
812  }
813 
814  // Try to shift base address to decrease offsets.
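 // Illustrative example (editor's note, not part of the original source):
 // element offsets 257 and 258 fit neither the plain 8-bit form nor the ST64
 // form, but their difference does fit in 8 bits; the isUInt<8>(Max - Min)
 // branch below picks BaseOff = mostAlignedValueInRange(258 - 0xff, 257) = 256
 // and rewrites the offsets as 1 and 2 relative to the adjusted base.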
815  uint32_t Min = std::min(EltOffset0, EltOffset1);
816  uint32_t Max = std::max(EltOffset0, EltOffset1);
817 
818  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
819  if (((Max - Min) & ~Mask) == 0) {
820  if (Modify) {
821  // From the range of values we could use for BaseOff, choose the one that
822  // is aligned to the highest power of two, to maximise the chance that
823  // the same offset can be reused for other load/store pairs.
824  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
825  // Copy the low bits of the offsets, so that when we adjust them by
826  // subtracting BaseOff they will be multiples of 64.
827  BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
828  CI.BaseOff = BaseOff * CI.EltSize;
829  CI.Offset = (EltOffset0 - BaseOff) / 64;
830  Paired.Offset = (EltOffset1 - BaseOff) / 64;
831  CI.UseST64 = true;
832  }
833  return true;
834  }
835 
836  if (isUInt<8>(Max - Min)) {
837  if (Modify) {
838  // From the range of values we could use for BaseOff, choose the one that
839  // is aligned to the highest power of two, to maximise the chance that
840  // the same offset can be reused for other load/store pairs.
841  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
842  CI.BaseOff = BaseOff * CI.EltSize;
843  CI.Offset = EltOffset0 - BaseOff;
844  Paired.Offset = EltOffset1 - BaseOff;
845  }
846  return true;
847  }
848 
849  return false;
850 }
851 
852 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
853  const CombineInfo &CI,
854  const CombineInfo &Paired) {
855  const unsigned Width = (CI.Width + Paired.Width);
856  switch (CI.InstClass) {
857  default:
858  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
859  case S_BUFFER_LOAD_IMM:
860  switch (Width) {
861  default:
862  return false;
863  case 2:
864  case 4:
865  case 8:
866  return true;
867  }
868  }
869 }
870 
871 const TargetRegisterClass *
872 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
873  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
874  return TRI->getRegClassForReg(*MRI, Dst->getReg());
875  }
876  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
877  return TRI->getRegClassForReg(*MRI, Src->getReg());
878  }
879  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
880  return TRI->getRegClassForReg(*MRI, Src->getReg());
881  }
882  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
883  return TRI->getRegClassForReg(*MRI, Dst->getReg());
884  }
885  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
886  return TRI->getRegClassForReg(*MRI, Src->getReg());
887  }
888  return nullptr;
889 }
890 
891 /// This function assumes that CI comes before Paired in a basic block.
892 bool SILoadStoreOptimizer::checkAndPrepareMerge(
893  CombineInfo &CI, CombineInfo &Paired,
894  SmallVectorImpl<MachineInstr *> &InstsToMove) {
895 
896  // Check both offsets (or masks for MIMG) can be combined and fit in the
897  // reduced range.
898  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
899  return false;
900 
901  if (CI.InstClass != MIMG &&
902  (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
903  return false;
904 
905  const unsigned Opc = CI.I->getOpcode();
906  const InstClassEnum InstClass = getInstClass(Opc, *TII);
907 
908  if (InstClass == UNKNOWN) {
909  return false;
910  }
911  const unsigned InstSubclass = getInstSubclass(Opc, *TII);
912 
913  // Do not merge VMEM buffer instructions with "swizzled" bit set.
914  int Swizzled =
915  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
916  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
917  return false;
918 
919  DenseSet<Register> RegDefsToMove;
920  DenseSet<Register> PhysRegUsesToMove;
921  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
922 
923  const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
924  bool IsAGPR = TRI->hasAGPRs(DataRC);
925 
926  MachineBasicBlock::iterator E = std::next(Paired.I);
927  MachineBasicBlock::iterator MBBI = std::next(CI.I);
928  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
929  for (; MBBI != E; ++MBBI) {
930 
931  if (MBBI == MBBE) {
932  // CombineInfo::Order is a hint on the instruction ordering within the
933  // basic block. This hint suggests that CI precedes Paired, which is
934  // true most of the time. However, moveInstsAfter() processing a
935  // previous list may have changed this order in a situation when it
936  // moves an instruction which exists in some other merge list.
937  // In this case it must be dependent.
938  return false;
939  }
940 
941  if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
942  (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
943  // This is not a matching instruction, but we can keep looking as
944  // long as one of these conditions is met:
945  // 1. It is safe to move I down past MBBI.
946  // 2. It is safe to move MBBI down past the instruction that I will
947  // be merged into.
948 
949  if (MBBI->hasUnmodeledSideEffects()) {
950  // We can't re-order this instruction with respect to other memory
951  // operations, so we fail both conditions mentioned above.
952  return false;
953  }
954 
955  if (MBBI->mayLoadOrStore() &&
956  (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
957  !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
958  // We fail condition #1, but we may still be able to satisfy condition
959  // #2. Add this instruction to the move list and then we will check
960  // if condition #2 holds once we have selected the matching instruction.
961  InstsToMove.push_back(&*MBBI);
962  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
963  continue;
964  }
965 
966  // When we match I with another DS instruction we will be moving I down
967  // to the location of the matched instruction; any uses of I will need to
968  // be moved down as well.
969  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
970  InstsToMove);
971  continue;
972  }
973 
974  // Don't merge volatiles.
975  if (MBBI->hasOrderedMemoryRef())
976  return false;
977 
978  int Swizzled =
979  AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
980  if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
981  return false;
982 
983  // Handle a case like
984  // DS_WRITE_B32 addr, v, idx0
985  // w = DS_READ_B32 addr, idx0
986  // DS_WRITE_B32 addr, f(w), idx1
987  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
988  // merging of the two writes.
989  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
990  InstsToMove))
991  continue;
992 
993  if (&*MBBI == &*Paired.I) {
994  if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
995  return false;
996  // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
997  // operands. However we are reporting that ds_write2 shall have
998  // only VGPR data so that machine copy propagation does not
999  // create an illegal instruction with VGPR and AGPR sources.
1000  // Consequently, if we create such an instruction the verifier
1001  // will complain.
1002  if (IsAGPR && CI.InstClass == DS_WRITE)
1003  return false;
1004 
1005  // We need to go through the list of instructions that we plan to
1006  // move and make sure they are all safe to move down past the merged
1007  // instruction.
1008  if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
1009 
1010  // Call offsetsCanBeCombined with modify = true so that the offsets are
1011  // correct for the new instruction. This should return true, because
1012  // this function should only be called on CombineInfo objects that
1013  // have already been confirmed to be mergeable.
1014  if (CI.InstClass != MIMG)
1015  offsetsCanBeCombined(CI, *STM, Paired, true);
1016  return true;
1017  }
1018  return false;
1019  }
1020 
1021  // We've found a load/store that we couldn't merge for some reason.
1022  // We could potentially keep looking, but we'd need to make sure that
1023  // it was safe to move I and also all the instructions in InstsToMove
1024  // down past this instruction.
1025  // Check if we can move I across MBBI and if we can move all I's users.
1026  if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
1027  !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
1028  break;
1029  }
1030  return false;
1031 }
1032 
1033 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1034  if (STM->ldsRequiresM0Init())
1035  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1036  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1037 }
1038 
1039 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1040  if (STM->ldsRequiresM0Init())
1041  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1042 
1043  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1044  : AMDGPU::DS_READ2ST64_B64_gfx9;
1045 }
1046 
1047 MachineBasicBlock::iterator
1048 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1049  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1050  MachineBasicBlock *MBB = CI.I->getParent();
1051 
1052  // Be careful, since the addresses could be subregisters themselves in weird
1053  // cases, like vectors of pointers.
1054  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1055 
1056  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1057  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1058 
1059  unsigned NewOffset0 = CI.Offset;
1060  unsigned NewOffset1 = Paired.Offset;
1061  unsigned Opc =
1062  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1063 
1064  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1065  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1066 
1067  if (NewOffset0 > NewOffset1) {
1068  // Canonicalize the merged instruction so the smaller offset comes first.
1069  std::swap(NewOffset0, NewOffset1);
1070  std::swap(SubRegIdx0, SubRegIdx1);
1071  }
1072 
1073  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1074  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1075 
1076  const MCInstrDesc &Read2Desc = TII->get(Opc);
1077 
1078  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1079  Register DestReg = MRI->createVirtualRegister(SuperRC);
1080 
1081  DebugLoc DL = CI.I->getDebugLoc();
1082 
1083  Register BaseReg = AddrReg->getReg();
1084  unsigned BaseSubReg = AddrReg->getSubReg();
1085  unsigned BaseRegFlags = 0;
1086  if (CI.BaseOff) {
1087  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1088  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1089  .addImm(CI.BaseOff);
1090 
1091  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1092  BaseRegFlags = RegState::Kill;
1093 
1094  TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1095  .addReg(ImmReg)
1096  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1097  .addImm(0); // clamp bit
1098  BaseSubReg = 0;
1099  }
1100 
1101  MachineInstrBuilder Read2 =
1102  BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1103  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1104  .addImm(NewOffset0) // offset0
1105  .addImm(NewOffset1) // offset1
1106  .addImm(0) // gds
1107  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1108 
1109  (void)Read2;
1110 
1111  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1112 
1113  // Copy to the old destination registers.
1114  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1115  .add(*Dest0) // Copy to same destination including flags and sub reg.
1116  .addReg(DestReg, 0, SubRegIdx0);
1117  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1118  .add(*Dest1)
1119  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1120 
1121  moveInstsAfter(Copy1, InstsToMove);
1122 
1123  CI.I->eraseFromParent();
1124  Paired.I->eraseFromParent();
1125 
1126  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1127  return Read2;
1128 }
1129 
1130 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1131  if (STM->ldsRequiresM0Init())
1132  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1133  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1134  : AMDGPU::DS_WRITE2_B64_gfx9;
1135 }
1136 
1137 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1138  if (STM->ldsRequiresM0Init())
1139  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1140  : AMDGPU::DS_WRITE2ST64_B64;
1141 
1142  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1143  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1144 }
1145 
1146 MachineBasicBlock::iterator
1147 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
1148  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1149  MachineBasicBlock *MBB = CI.I->getParent();
1150 
1151  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1152  // sure we preserve the subregister index and any register flags set on them.
1153  const MachineOperand *AddrReg =
1154  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1155  const MachineOperand *Data0 =
1156  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1157  const MachineOperand *Data1 =
1158  TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1159 
1160  unsigned NewOffset0 = CI.Offset;
1161  unsigned NewOffset1 = Paired.Offset;
1162  unsigned Opc =
1163  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1164 
1165  if (NewOffset0 > NewOffset1) {
1166  // Canonicalize the merged instruction so the smaller offset comes first.
1167  std::swap(NewOffset0, NewOffset1);
1168  std::swap(Data0, Data1);
1169  }
1170 
1171  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1172  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1173 
1174  const MCInstrDesc &Write2Desc = TII->get(Opc);
1175  DebugLoc DL = CI.I->getDebugLoc();
1176 
1177  Register BaseReg = AddrReg->getReg();
1178  unsigned BaseSubReg = AddrReg->getSubReg();
1179  unsigned BaseRegFlags = 0;
1180  if (CI.BaseOff) {
1181  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1182  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1183  .addImm(CI.BaseOff);
1184 
1185  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1186  BaseRegFlags = RegState::Kill;
1187 
1188  TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1189  .addReg(ImmReg)
1190  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1191  .addImm(0); // clamp bit
1192  BaseSubReg = 0;
1193  }
1194 
1195  MachineInstrBuilder Write2 =
1196  BuildMI(*MBB, Paired.I, DL, Write2Desc)
1197  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1198  .add(*Data0) // data0
1199  .add(*Data1) // data1
1200  .addImm(NewOffset0) // offset0
1201  .addImm(NewOffset1) // offset1
1202  .addImm(0) // gds
1203  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1204 
1205  moveInstsAfter(Write2, InstsToMove);
1206 
1207  CI.I->eraseFromParent();
1208  Paired.I->eraseFromParent();
1209 
1210  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1211  return Write2;
1212 }
1213 
1214 MachineBasicBlock::iterator
1215 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1216  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1217  MachineBasicBlock *MBB = CI.I->getParent();
1218  DebugLoc DL = CI.I->getDebugLoc();
1219  const unsigned Opcode = getNewOpcode(CI, Paired);
1220 
1221  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1222 
1223  Register DestReg = MRI->createVirtualRegister(SuperRC);
1224  unsigned MergedDMask = CI.DMask | Paired.DMask;
1225  unsigned DMaskIdx =
1226  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1227 
1228  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1229  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1230  if (I == DMaskIdx)
1231  MIB.addImm(MergedDMask);
1232  else
1233  MIB.add((*CI.I).getOperand(I));
1234  }
1235 
1236  // It shouldn't be possible to get this far if the two instructions
1237  // don't have a single memoperand, because MachineInstr::mayAlias()
1238  // will return true if this is the case.
1239  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1240 
1241  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1242  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1243 
1244  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1245 
1246  unsigned SubRegIdx0, SubRegIdx1;
1247  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1248 
1249  // Copy to the old destination registers.
1250  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1251  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1252  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1253 
1254  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1255  .add(*Dest0) // Copy to same destination including flags and sub reg.
1256  .addReg(DestReg, 0, SubRegIdx0);
1257  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1258  .add(*Dest1)
1259  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1260 
1261  moveInstsAfter(Copy1, InstsToMove);
1262 
1263  CI.I->eraseFromParent();
1264  Paired.I->eraseFromParent();
1265  return New;
1266 }
1267 
1268 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1269  CombineInfo &CI, CombineInfo &Paired,
1270  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1271  MachineBasicBlock *MBB = CI.I->getParent();
1272  DebugLoc DL = CI.I->getDebugLoc();
1273  const unsigned Opcode = getNewOpcode(CI, Paired);
1274 
1275  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1276 
1277  Register DestReg = MRI->createVirtualRegister(SuperRC);
1278  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1279 
1280  // It shouldn't be possible to get this far if the two instructions
1281  // don't have a single memoperand, because MachineInstr::mayAlias()
1282  // will return true if this is the case.
1283  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1284 
1285  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1286  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1287 
1288  MachineInstr *New =
1289  BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1290  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1291  .addImm(MergedOffset) // offset
1292  .addImm(CI.CPol) // cpol
1293  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1294 
1295  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1296  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1297  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1298 
1299  // Copy to the old destination registers.
1300  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1301  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1302  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1303 
1304  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1305  .add(*Dest0) // Copy to same destination including flags and sub reg.
1306  .addReg(DestReg, 0, SubRegIdx0);
1307  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1308  .add(*Dest1)
1309  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1310 
1311  moveInstsAfter(Copy1, InstsToMove);
1312 
1313  CI.I->eraseFromParent();
1314  Paired.I->eraseFromParent();
1315  return New;
1316 }
1317 
1318 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1319  CombineInfo &CI, CombineInfo &Paired,
1320  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1321  MachineBasicBlock *MBB = CI.I->getParent();
1322  DebugLoc DL = CI.I->getDebugLoc();
1323 
1324  const unsigned Opcode = getNewOpcode(CI, Paired);
1325 
1326  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327 
1328  // Copy to the new source register.
1329  Register DestReg = MRI->createVirtualRegister(SuperRC);
1330  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1331 
1332  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1333 
1334  AddressRegs Regs = getRegs(Opcode, *TII);
1335 
1336  if (Regs.VAddr)
1337  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1338 
1339  // It shouldn't be possible to get this far if the two instructions
1340  // don't have a single memoperand, because MachineInstr::mayAlias()
1341  // will return true if this is the case.
1342  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1343 
1344  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1345  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1346 
1347  MachineInstr *New =
1348  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1349  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1350  .addImm(MergedOffset) // offset
1351  .addImm(CI.CPol) // cpol
1352  .addImm(0) // tfe
1353  .addImm(0) // swz
1354  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1355 
1356  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1357  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1358  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1359 
1360  // Copy to the old destination registers.
1361  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1362  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1363  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1364 
1365  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1366  .add(*Dest0) // Copy to same destination including flags and sub reg.
1367  .addReg(DestReg, 0, SubRegIdx0);
1368  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1369  .add(*Dest1)
1370  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1371 
1372  moveInstsAfter(Copy1, InstsToMove);
1373 
1374  CI.I->eraseFromParent();
1375  Paired.I->eraseFromParent();
1376  return New;
1377 }
1378 
1379 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1380  CombineInfo &CI, CombineInfo &Paired,
1381  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1382  MachineBasicBlock *MBB = CI.I->getParent();
1383  DebugLoc DL = CI.I->getDebugLoc();
1384 
1385  const unsigned Opcode = getNewOpcode(CI, Paired);
1386 
1387  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1388 
1389  // Copy to the new source register.
1390  Register DestReg = MRI->createVirtualRegister(SuperRC);
1391  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1392 
1393  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1394 
1395  AddressRegs Regs = getRegs(Opcode, *TII);
1396 
1397  if (Regs.VAddr)
1398  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1399 
1400  unsigned JoinedFormat =
1401  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1402 
1403  // It shouldn't be possible to get this far if the two instructions
1404  // don't have a single memoperand, because MachineInstr::mayAlias()
1405  // will return true if this is the case.
1406  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1407 
1408  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1409  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1410 
1411  MachineInstr *New =
1412  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1413  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1414  .addImm(MergedOffset) // offset
1415  .addImm(JoinedFormat) // format
1416  .addImm(CI.CPol) // cpol
1417  .addImm(0) // tfe
1418  .addImm(0) // swz
1419  .addMemOperand(
1420  combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1421 
1422  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1423  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1424  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1425 
1426  // Copy to the old destination registers.
1427  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1428  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1429  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1430 
1431  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1432  .add(*Dest0) // Copy to same destination including flags and sub reg.
1433  .addReg(DestReg, 0, SubRegIdx0);
1434  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1435  .add(*Dest1)
1436  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1437 
1438  moveInstsAfter(Copy1, InstsToMove);
1439 
1440  CI.I->eraseFromParent();
1441  Paired.I->eraseFromParent();
1442  return New;
1443 }
1444 
1445 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1446  CombineInfo &CI, CombineInfo &Paired,
1447  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1448  MachineBasicBlock *MBB = CI.I->getParent();
1449  DebugLoc DL = CI.I->getDebugLoc();
1450 
1451  const unsigned Opcode = getNewOpcode(CI, Paired);
1452 
1453  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1454  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1455  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1456 
1457  // Copy to the new source register.
1458  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1460 
1461  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1462  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1463 
1464  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1465  .add(*Src0)
1466  .addImm(SubRegIdx0)
1467  .add(*Src1)
1468  .addImm(SubRegIdx1);
1469 
1470  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1471  .addReg(SrcReg, RegState::Kill);
1472 
1473  AddressRegs Regs = getRegs(Opcode, *TII);
1474 
1475  if (Regs.VAddr)
1476  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1477 
1478  unsigned JoinedFormat =
1479  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1480 
1481  // It shouldn't be possible to get this far if the two instructions
1482  // don't have a single memoperand, because MachineInstr::mayAlias()
1483  // will return true if this is the case.
1484  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1485 
1486  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1487  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1488 
1489  MachineInstr *New =
1490  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1491  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1492  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1493  .addImm(JoinedFormat) // format
1494  .addImm(CI.CPol) // cpol
1495  .addImm(0) // tfe
1496  .addImm(0) // swz
1497  .addMemOperand(
1498  combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1499 
1500  moveInstsAfter(MIB, InstsToMove);
1501 
1502  CI.I->eraseFromParent();
1503  Paired.I->eraseFromParent();
1504  return New;
1505 }
1506 
1507 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1508  const CombineInfo &Paired) {
1509  const unsigned Width = CI.Width + Paired.Width;
1510 
1511  switch (CI.InstClass) {
1512  default:
1513  assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1514  // FIXME: Handle d16 correctly
1515  return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1516  Width);
1517  case TBUFFER_LOAD:
1518  case TBUFFER_STORE:
1519  return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1520  Width);
1521 
1522  case UNKNOWN:
1523  llvm_unreachable("Unknown instruction class");
1524  case S_BUFFER_LOAD_IMM:
1525  switch (Width) {
1526  default:
1527  return 0;
1528  case 2:
1529  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1530  case 4:
1531  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1532  case 8:
1533  return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1534  }
1535  case MIMG:
1536  assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1537  "No overlaps");
1538  return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1539  }
1540 }
1541 
1542 std::pair<unsigned, unsigned>
1543 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1544  const CombineInfo &Paired) {
1545 
1546  assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
1547 
1548  bool ReverseOrder;
1549  if (CI.InstClass == MIMG) {
1550  assert(
1551  (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1552  "No overlaps");
1553  ReverseOrder = CI.DMask > Paired.DMask;
1554  } else
1555  ReverseOrder = CI.Offset > Paired.Offset;
1556 
1557  unsigned Idx0;
1558  unsigned Idx1;
1559 
1560  if (CI.Width + Paired.Width > 4) {
1561  assert(CI.Width == 4 && Paired.Width == 4);
1562 
1563  if (ReverseOrder) {
1564  Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
1565  Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
1566  } else {
1567  Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
1568  Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
1569  }
1570  } else {
1571  static const unsigned Idxs[4][4] = {
1572  {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1573  {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1574  {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1575  {AMDGPU::sub3, 0, 0, 0},
1576  };
1577 
1578  assert(CI.Width >= 1 && CI.Width <= 3);
1579  assert(Paired.Width >= 1 && Paired.Width <= 3);
1580 
1581  if (ReverseOrder) {
1582  Idx1 = Idxs[0][Paired.Width - 1];
1583  Idx0 = Idxs[Paired.Width][CI.Width - 1];
1584  } else {
1585  Idx0 = Idxs[0][CI.Width - 1];
1586  Idx1 = Idxs[CI.Width][Paired.Width - 1];
1587  }
1588  }
1589 
1590  return std::make_pair(Idx0, Idx1);
1591 }
1592 
1593 const TargetRegisterClass *
1594 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1595  const CombineInfo &Paired) {
1596  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1597  switch (CI.Width + Paired.Width) {
1598  default:
1599  return nullptr;
1600  case 2:
1601  return &AMDGPU::SReg_64_XEXECRegClass;
1602  case 4:
1603  return &AMDGPU::SGPR_128RegClass;
1604  case 8:
1605  return &AMDGPU::SGPR_256RegClass;
1606  case 16:
1607  return &AMDGPU::SGPR_512RegClass;
1608  }
1609  }
1610 
1611  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1612  return TRI->isAGPRClass(getDataRegClass(*CI.I))
1613  ? TRI->getAGPRClassForBitWidth(BitWidth)
1614  : TRI->getVGPRClassForBitWidth(BitWidth);
1615 }
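// For illustration: an S_BUFFER_LOAD pair with a combined width of four dwords
// gets SGPR_128, while a VMEM pair gets a VGPR class (or an AGPR class when the
// data register class is AGPR) sized at 32 bits per dword of combined width,
// e.g. a 2 + 2 buffer-load pair uses a 128-bit VGPR class.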
1616 
1617 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1618  CombineInfo &CI, CombineInfo &Paired,
1619  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1620  MachineBasicBlock *MBB = CI.I->getParent();
1621  DebugLoc DL = CI.I->getDebugLoc();
1622 
1623  const unsigned Opcode = getNewOpcode(CI, Paired);
1624 
1625  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1626  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1627  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1628 
1629  // Copy to the new source register.
1630  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1631  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1632 
1633  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1634  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1635 
1636  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1637  .add(*Src0)
1638  .addImm(SubRegIdx0)
1639  .add(*Src1)
1640  .addImm(SubRegIdx1);
1641 
1642  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1643  .addReg(SrcReg, RegState::Kill);
1644 
1645  AddressRegs Regs = getRegs(Opcode, *TII);
1646 
1647  if (Regs.VAddr)
1648  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1649 
1650 
1651  // It shouldn't be possible to get this far if the two instructions
1652  // don't have a single memoperand, because MachineInstr::mayAlias()
1653  // will return true if this is the case.
1654  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1655 
1656  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1657  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1658 
1659  MachineInstr *New =
1660  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1661  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1662  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1663  .addImm(CI.CPol) // cpol
1664  .addImm(0) // tfe
1665  .addImm(0) // swz
1666  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1667 
1668  moveInstsAfter(MIB, InstsToMove);
1669 
1670  CI.I->eraseFromParent();
1671  Paired.I->eraseFromParent();
1672  return New;
1673 }
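// For illustration (hypothetical offsets): two buffer_store_dword instructions
// at offsets 4 and 8 from the same descriptor are merged by packing their
// vdata operands into one 64-bit register with REG_SEQUENCE and emitting a
// single buffer_store_dwordx2 at offset 4, the smaller of the two offsets.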
1674 
1675 MachineOperand
1676 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1677  APInt V(32, Val, true);
1678  if (TII->isInlineConstant(V))
1679  return MachineOperand::CreateImm(Val);
1680 
1681  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1682  MachineInstr *Mov =
1683  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1684  TII->get(AMDGPU::S_MOV_B32), Reg)
1685  .addImm(Val);
1686  (void)Mov;
1687  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1688  return MachineOperand::CreateReg(Reg, false);
1689 }
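// For illustration: a value such as 32 fits the inline-constant encoding and
// is returned directly as an immediate operand, whereas a value like 0x1800
// does not and is first materialized into an SGPR with S_MOV_B32.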
1690 
1691 // Compute base address using Addr and return the final register.
1692 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1693  const MemAddress &Addr) const {
1694  MachineBasicBlock *MBB = MI.getParent();
1695  MachineBasicBlock::iterator MBBI = MI.getIterator();
1696  DebugLoc DL = MI.getDebugLoc();
1697 
1698  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1699  Addr.Base.LoSubReg) &&
1700  "Expected 32-bit Base-Register-Low!!");
1701 
1702  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1703  Addr.Base.HiSubReg) &&
1704  "Expected 32-bit Base-Register-Hi!!");
1705 
1706  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1707  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1708  MachineOperand OffsetHi =
1709  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1710 
1711  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1712  Register CarryReg = MRI->createVirtualRegister(CarryRC);
1713  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1714 
1715  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1716  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1717  MachineInstr *LoHalf =
1718  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1719  .addReg(CarryReg, RegState::Define)
1720  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1721  .add(OffsetLo)
1722  .addImm(0); // clamp bit
1723  (void)LoHalf;
1724  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1725 
1726  MachineInstr *HiHalf =
1727  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1728  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1729  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1730  .add(OffsetHi)
1731  .addReg(CarryReg, RegState::Kill)
1732  .addImm(0); // clamp bit
1733  (void)HiHalf;
1734  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1735 
1736  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1737  MachineInstr *FullBase =
1738  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1739  .addReg(DestSub0)
1740  .addImm(AMDGPU::sub0)
1741  .addReg(DestSub1)
1742  .addImm(AMDGPU::sub1);
1743  (void)FullBase;
1744  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1745 
1746  return FullDestReg;
1747 }
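// For illustration (register names are placeholders), the emitted sequence is
// roughly:
//   %lo  = V_ADD_CO_U32_e64 %base.lo, OffsetLo     (also defines %carry)
//   %hi  = V_ADDC_U32_e64   %base.hi, OffsetHi, %carry
//   %new = REG_SEQUENCE %lo, sub0, %hi, sub1
// and %new is returned as the re-computed anchor base.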
1748 
1749 // Update base and offset with the NewBase and NewOffset in MI.
1750 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1751  Register NewBase,
1752  int32_t NewOffset) const {
1753  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1754  Base->setReg(NewBase);
1755  Base->setIsKill(false);
1756  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1757 }
1758 
1759 Optional<int32_t>
1760 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1761  if (Op.isImm())
1762  return Op.getImm();
1763 
1764  if (!Op.isReg())
1765  return None;
1766 
1767  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1768  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1769  !Def->getOperand(1).isImm())
1770  return None;
1771 
1772  return Def->getOperand(1).getImm();
1773 }
1774 
1775 // Analyze Base and extract:
1776 // - the 32-bit base registers and subregisters
1777 // - the 64-bit constant offset
1778 // Expecting base computation as:
1779 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1780 // %LO:vgpr_32, %c:sreg_64_xexec =
1781 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1782 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1783 // %Base:vreg_64 =
1784 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1785 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1786  MemAddress &Addr) const {
1787  if (!Base.isReg())
1788  return;
1789 
1790  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1791  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1792  || Def->getNumOperands() != 5)
1793  return;
1794 
1795  MachineOperand BaseLo = Def->getOperand(1);
1796  MachineOperand BaseHi = Def->getOperand(3);
1797  if (!BaseLo.isReg() || !BaseHi.isReg())
1798  return;
1799 
1800  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1801  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1802 
1803  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1804  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1805  return;
1806 
1807  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1808  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1809 
1810  auto Offset0P = extractConstOffset(*Src0);
1811  if (Offset0P)
1812  BaseLo = *Src1;
1813  else {
1814  if (!(Offset0P = extractConstOffset(*Src1)))
1815  return;
1816  BaseLo = *Src0;
1817  }
1818 
1819  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1820  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1821 
1822  if (Src0->isImm())
1823  std::swap(Src0, Src1);
1824 
1825  if (!Src1->isImm())
1826  return;
1827 
1828  uint64_t Offset1 = Src1->getImm();
1829  BaseHi = *Src0;
1830 
1831  Addr.Base.LoReg = BaseLo.getReg();
1832  Addr.Base.HiReg = BaseHi.getReg();
1833  Addr.Base.LoSubReg = BaseLo.getSubReg();
1834  Addr.Base.HiSubReg = BaseHi.getSubReg();
1835  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1836 }
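// For illustration, given the pattern shown in the comment above (8000 added
// into the low half and 0 added into the high half), this records
// Addr.Base.LoReg/HiReg = %BASE_LO/%BASE_HI and Addr.Offset = 8000.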
1837 
1838 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1839  MachineInstr &MI,
1840  MemInfoMap &Visited,
1841  SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1842 
1843  if (!(MI.mayLoad() ^ MI.mayStore()))
1844  return false;
1845 
1846  // TODO: Support flat and scratch.
1847  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1848  return false;
1849 
1850  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1851  return false;
1852 
1853  if (AnchorList.count(&MI))
1854  return false;
1855 
1856  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1857 
1858  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1859  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1860  return false;
1861  }
1862 
1863  // Step1: Find the base registers and a 64-bit constant offset.
1864  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1865  MemAddress MAddr;
1866  if (Visited.find(&MI) == Visited.end()) {
1867  processBaseWithConstOffset(Base, MAddr);
1868  Visited[&MI] = MAddr;
1869  } else
1870  MAddr = Visited[&MI];
1871 
1872  if (MAddr.Offset == 0) {
1873  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1874  " constant offsets that can be promoted.\n";);
1875  return false;
1876  }
1877 
1878  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1879  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1880 
1881  // Step2: Traverse through MI's basic block and find an anchor (one with the
1882  // same base registers) at the highest 13-bit distance from MI's offset.
1883  // E.g. (64bit loads)
1884  // bb:
1885  // addr1 = &a + 4096; load1 = load(addr1, 0)
1886  // addr2 = &a + 6144; load2 = load(addr2, 0)
1887  // addr3 = &a + 8192; load3 = load(addr3, 0)
1888  // addr4 = &a + 10240; load4 = load(addr4, 0)
1889  // addr5 = &a + 12288; load5 = load(addr5, 0)
1890  //
1891  // Starting from the first load, the optimization will try to find a new base
1892  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1893  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
1894  // new base (anchor) because the maximum distance can presumably accommodate
1895  // more intermediate bases.
1896  //
1897  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1898  // (&a + 8192) for load1, load2, load4.
1899  // addr = &a + 8192
1900  // load1 = load(addr, -4096)
1901  // load2 = load(addr, -2048)
1902  // load3 = load(addr, 0)
1903  // load4 = load(addr, 2048)
1904  // addr5 = &a + 12288; load5 = load(addr5, 0)
1905  //
1906  MachineInstr *AnchorInst = nullptr;
1907  MemAddress AnchorAddr;
1908  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1909  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1910 
1911  MachineBasicBlock *MBB = MI.getParent();
1912  MachineBasicBlock::iterator E = MBB->end();
1913  MachineBasicBlock::iterator MBBI = MI.getIterator();
1914  ++MBBI;
1915  const SITargetLowering *TLI =
1916  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1917 
1918  for ( ; MBBI != E; ++MBBI) {
1919  MachineInstr &MINext = *MBBI;
1920  // TODO: Support finding an anchor (with the same base) from store addresses
1921  // or any other load addresses where the opcodes are different.
1922  if (MINext.getOpcode() != MI.getOpcode() ||
1923  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1924  continue;
1925 
1926  const MachineOperand &BaseNext =
1927  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1928  MemAddress MAddrNext;
1929  if (Visited.find(&MINext) == Visited.end()) {
1930  processBaseWithConstOffset(BaseNext, MAddrNext);
1931  Visited[&MINext] = MAddrNext;
1932  } else
1933  MAddrNext = Visited[&MINext];
1934 
1935  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1936  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1937  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1938  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1939  continue;
1940 
1941  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1942 
1943  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1944  TargetLoweringBase::AddrMode AM;
1945  AM.HasBaseReg = true;
1946  AM.BaseOffs = Dist;
1947  if (TLI->isLegalGlobalAddressingMode(AM) &&
1948  (uint32_t)std::abs(Dist) > MaxDist) {
1949  MaxDist = std::abs(Dist);
1950 
1951  AnchorAddr = MAddrNext;
1952  AnchorInst = &MINext;
1953  }
1954  }
1955 
1956  if (AnchorInst) {
1957  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1958  AnchorInst->dump());
1959  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1960  << AnchorAddr.Offset << "\n\n");
1961 
1962  // Instead of moving up, just re-compute anchor-instruction's base address.
1963  Register Base = computeBase(MI, AnchorAddr);
1964 
1965  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1966  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1967 
1968  for (auto P : InstsWCommonBase) {
1969  TargetLoweringBase::AddrMode AM;
1970  AM.HasBaseReg = true;
1971  AM.BaseOffs = P.second - AnchorAddr.Offset;
1972 
1973  if (TLI->isLegalGlobalAddressingMode(AM)) {
1974  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1975  dbgs() << ")"; P.first->dump());
1976  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1977  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1978  }
1979  }
1980  AnchorList.insert(AnchorInst);
1981  return true;
1982  }
1983 
1984  return false;
1985 }
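// For illustration, using the loads from the Step2/Step3 comment above: the
// anchor base (&a + 8192) is re-materialized next to load1, and load1, load2
// and load4 are rewritten against it with immediate offsets -4096, -2048 and
// 2048, while load5 keeps its own base because its distance of 4096 is
// rejected by isLegalGlobalAddressingMode.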
1986 
1987 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1988  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1989  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1990  if (AddrList.front().InstClass == CI.InstClass &&
1991  AddrList.front().hasSameBaseAddress(*CI.I)) {
1992  AddrList.emplace_back(CI);
1993  return;
1994  }
1995  }
1996 
1997  // Base address not found, so add a new list.
1998  MergeableInsts.emplace_back(1, CI);
1999 }
2000 
2001 std::pair<MachineBasicBlock::iterator, bool>
2002 SILoadStoreOptimizer::collectMergeableInsts(
2003  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2004  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2005  std::list<std::list<CombineInfo>> &MergeableInsts) const {
2006  bool Modified = false;
2007 
2008  // Sort potentially mergeable instructions into lists, one list per base address.
2009  unsigned Order = 0;
2010  MachineBasicBlock::iterator BlockI = Begin;
2011  for (; BlockI != End; ++BlockI) {
2012  MachineInstr &MI = *BlockI;
2013 
2014  // We run this before checking if an address is mergeable, because it can produce
2015  // better code even if the instructions aren't mergeable.
2016  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2017  Modified = true;
2018 
2019  // Don't combine if volatile. We also won't be able to merge across this, so
2020  // break the search. We can look after this barrier for separate merges.
2021  if (MI.hasOrderedMemoryRef()) {
2022  LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
2023 
2024  // Search will resume after this instruction in a separate merge list.
2025  ++BlockI;
2026  break;
2027  }
2028 
2029  const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2030  if (InstClass == UNKNOWN)
2031  continue;
2032 
2033  CombineInfo CI;
2034  CI.setMI(MI, *TII, *STM);
2035  CI.Order = Order++;
2036 
2037  if (!CI.hasMergeableAddress(*MRI))
2038  continue;
2039 
2040  LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2041 
2042  addInstToMergeableList(CI, MergeableInsts);
2043  }
2044 
2045  // At this point we have lists of mergeable instructions.
2046  //
2047  // Part 2: Sort each list by offset so that candidates which can be merged
2048  // are adjacent, and drop lists with fewer than two entries, since a merge
2049  // always needs a pair. The pairing itself is done later, in
2050  // optimizeInstsWithSameBaseAddr().
2051 
2052  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2053  E = MergeableInsts.end(); I != E;) {
2054 
2055  std::list<CombineInfo> &MergeList = *I;
2056  if (MergeList.size() <= 1) {
2057  // This means we have found only one instruction with a given address
2058  // that can be merged, and we need at least 2 instructions to do a merge,
2059  // so this list can be discarded.
2060  I = MergeableInsts.erase(I);
2061  continue;
2062  }
2063 
2064  // Sort the lists by offsets, this way mergeable instructions will be
2065  // adjacent to each other in the list, which will make it easier to find
2066  // matches.
2067  MergeList.sort(
2068  [] (const CombineInfo &A, const CombineInfo &B) {
2069  return A.Offset < B.Offset;
2070  });
2071  ++I;
2072  }
2073 
2074  return std::make_pair(BlockI, Modified);
2075 }
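// For illustration (hypothetical block): four buffer loads using two distinct
// base addresses produce two lists, each sorted by offset; a list that ends up
// with a single entry is discarded, since a merge always needs a pair.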
2076 
2077 // Scan through looking for adjacent LDS operations with constant offsets from
2078 // the same base register. We rely on the scheduler to do the hard work of
2079 // clustering nearby loads, and assume these are all adjacent.
2080 bool SILoadStoreOptimizer::optimizeBlock(
2081  std::list<std::list<CombineInfo> > &MergeableInsts) {
2082  bool Modified = false;
2083 
2084  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2085  E = MergeableInsts.end(); I != E;) {
2086  std::list<CombineInfo> &MergeList = *I;
2087 
2088  bool OptimizeListAgain = false;
2089  if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2090  // We weren't able to make any changes, so delete the list so we don't
2091  // process the same instructions the next time we try to optimize this
2092  // block.
2093  I = MergeableInsts.erase(I);
2094  continue;
2095  }
2096 
2097  Modified = true;
2098 
2099  // We made changes, but also determined that there were no more optimization
2100  // opportunities, so we don't need to reprocess the list.
2101  if (!OptimizeListAgain) {
2102  I = MergeableInsts.erase(I);
2103  continue;
2104  }
2105  OptimizeAgain = true;
2106  }
2107  return Modified;
2108 }
2109 
2110 bool
2111 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2112  std::list<CombineInfo> &MergeList,
2113  bool &OptimizeListAgain) {
2114  if (MergeList.empty())
2115  return false;
2116 
2117  bool Modified = false;
2118 
2119  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2120  Next = std::next(I)) {
2121 
2122  auto First = I;
2123  auto Second = Next;
2124 
2125  if ((*First).Order > (*Second).Order)
2126  std::swap(First, Second);
2127  CombineInfo &CI = *First;
2128  CombineInfo &Paired = *Second;
2129 
2130  SmallVector<MachineInstr *, 8> InstsToMove;
2131  if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
2132  ++I;
2133  continue;
2134  }
2135 
2136  Modified = true;
2137 
2138  LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2139 
2140  switch (CI.InstClass) {
2141  default:
2142  llvm_unreachable("unknown InstClass");
2143  break;
2144  case DS_READ: {
2145  MachineBasicBlock::iterator NewMI =
2146  mergeRead2Pair(CI, Paired, InstsToMove);
2147  CI.setMI(NewMI, *TII, *STM);
2148  break;
2149  }
2150  case DS_WRITE: {
2151  MachineBasicBlock::iterator NewMI =
2152  mergeWrite2Pair(CI, Paired, InstsToMove);
2153  CI.setMI(NewMI, *TII, *STM);
2154  break;
2155  }
2156  case S_BUFFER_LOAD_IMM: {
2157  MachineBasicBlock::iterator NewMI =
2158  mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
2159  CI.setMI(NewMI, *TII, *STM);
2160  OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
2161  break;
2162  }
2163  case BUFFER_LOAD: {
2164  MachineBasicBlock::iterator NewMI =
2165  mergeBufferLoadPair(CI, Paired, InstsToMove);
2166  CI.setMI(NewMI, *TII, *STM);
2167  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2168  break;
2169  }
2170  case BUFFER_STORE: {
2171  MachineBasicBlock::iterator NewMI =
2172  mergeBufferStorePair(CI, Paired, InstsToMove);
2173  CI.setMI(NewMI, *TII, *STM);
2174  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2175  break;
2176  }
2177  case MIMG: {
2178  MachineBasicBlock::iterator NewMI =
2179  mergeImagePair(CI, Paired, InstsToMove);
2180  CI.setMI(NewMI, *TII, *STM);
2181  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2182  break;
2183  }
2184  case TBUFFER_LOAD: {
2185  MachineBasicBlock::iterator NewMI =
2186  mergeTBufferLoadPair(CI, Paired, InstsToMove);
2187  CI.setMI(NewMI, *TII, *STM);
2188  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2189  break;
2190  }
2191  case TBUFFER_STORE: {
2192  MachineBasicBlock::iterator NewMI =
2193  mergeTBufferStorePair(CI, Paired, InstsToMove);
2194  CI.setMI(NewMI, *TII, *STM);
2195  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2196  break;
2197  }
2198  }
2199  CI.Order = Paired.Order;
2200  if (I == Second)
2201  I = Next;
2202 
2203  MergeList.erase(Second);
2204  }
2205 
2206  return Modified;
2207 }
2208 
2209 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2210  if (skipFunction(MF.getFunction()))
2211  return false;
2212 
2213  STM = &MF.getSubtarget<GCNSubtarget>();
2214  if (!STM->loadStoreOptEnabled())
2215  return false;
2216 
2217  TII = STM->getInstrInfo();
2218  TRI = &TII->getRegisterInfo();
2219 
2220  MRI = &MF.getRegInfo();
2221  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2222 
2223  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2224 
2225  bool Modified = false;
2226 
2227  // Contains the list of instructions for which constant offsets are being
2228  // promoted to the immediate. This is tracked for an entire block at a time.
2229  SmallPtrSet<MachineInstr *, 4> AnchorList;
2230  MemInfoMap Visited;
2231 
2232  for (MachineBasicBlock &MBB : MF) {
2233  MachineBasicBlock::iterator SectionEnd;
2234  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2235  I = SectionEnd) {
2236  bool CollectModified;
2237  std::list<std::list<CombineInfo>> MergeableInsts;
2238 
2239  // First pass: Collect a list of all the instructions we know how to merge in
2240  // a subset of the block.
2241  std::tie(SectionEnd, CollectModified) =
2242  collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2243 
2244  Modified |= CollectModified;
2245 
2246  do {
2247  OptimizeAgain = false;
2248  Modified |= optimizeBlock(MergeableInsts);
2249  } while (OptimizeAgain);
2250  }
2251 
2252  Visited.clear();
2253  AnchorList.clear();
2254  }
2255 
2256  return Modified;
2257 }