SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // It will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
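// (In ds_read2_b32 the offsets are encoded in units of the 4-byte element
//  size: 16/4 = 4 and 32/4 = 8.)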
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset into the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows it to have a 13-bit constant offset, and then promotes that 13-bit
25 // offset into the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
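// (0x1800 - 0x1000 = 0x800 = 2048, which fits in the 13-bit immediate offset
//  field, so the extra base computation for 0x1800 can be removed.)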
42 //
43 // Future improvements:
44 //
45 // - Stores of constants are currently missed because the load of the constant
46 // into the data register is placed between the stores, although this is
47 // arguably a scheduling problem.
48 //
49 // - Recomputing live intervals seems inefficient: this currently matches only
50 // one pair, recomputes live intervals, and then moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads has offsets that are too large to fit in the 8-bit offset
55 // fields, but are close enough to each other, we can add to the base pointer
56 // and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "AMDGPUSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "SIInstrInfo.h"
64 #include "SIRegisterInfo.h"
65 #include "Utils/AMDGPUBaseInfo.h"
66 #include "llvm/ADT/ArrayRef.h"
67 #include "llvm/ADT/SmallVector.h"
68 #include "llvm/ADT/StringRef.h"
69 #include "llvm/Analysis/AliasAnalysis.h"
70 #include "llvm/CodeGen/MachineBasicBlock.h"
71 #include "llvm/CodeGen/MachineFunction.h"
72 #include "llvm/CodeGen/MachineFunctionPass.h"
73 #include "llvm/CodeGen/MachineInstr.h"
74 #include "llvm/CodeGen/MachineInstrBuilder.h"
75 #include "llvm/CodeGen/MachineOperand.h"
76 #include "llvm/CodeGen/MachineRegisterInfo.h"
77 #include "llvm/IR/DebugLoc.h"
78 #include "llvm/Pass.h"
79 #include "llvm/Support/Debug.h"
80 #include "llvm/Support/MathExtras.h"
81 #include "llvm/Support/raw_ostream.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <iterator>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "si-load-store-opt"
91 
92 namespace {
93 enum InstClassEnum {
94  UNKNOWN,
95  DS_READ,
96  DS_WRITE,
97  S_BUFFER_LOAD_IMM,
98  BUFFER_LOAD,
99  BUFFER_STORE,
100  MIMG,
101 };
102 
103 enum RegisterEnum {
104  SBASE = 0x1,
105  SRSRC = 0x2,
106  SOFFSET = 0x4,
107  VADDR = 0x8,
108  ADDR = 0x10,
109  SSAMP = 0x20,
110 };
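// The flags above describe which address-forming operands (sbase, srsrc,
// soffset, vaddr, addr, ssamp) an opcode carries; getRegs() computes them
// per opcode.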
111 
112 class SILoadStoreOptimizer : public MachineFunctionPass {
113  struct CombineInfo {
114  MachineBasicBlock::iterator I;
115  MachineBasicBlock::iterator Paired;
116  unsigned EltSize;
117  unsigned Offset0;
118  unsigned Offset1;
119  unsigned Width0;
120  unsigned Width1;
121  unsigned BaseOff;
122  unsigned DMask0;
123  unsigned DMask1;
124  InstClassEnum InstClass;
125  bool GLC0;
126  bool GLC1;
127  bool SLC0;
128  bool SLC1;
129  bool DLC0;
130  bool DLC1;
131  bool UseST64;
132  SmallVector<MachineInstr *, 8> InstsToMove;
133  int AddrIdx[5];
134  const MachineOperand *AddrReg[5];
135  unsigned NumAddresses;
136 
137  bool hasSameBaseAddress(const MachineInstr &MI) {
138  for (unsigned i = 0; i < NumAddresses; i++) {
139  const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
140 
141  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
142  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
143  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
144  return false;
145  }
146  continue;
147  }
148 
149  // Check same base pointer. Be careful of subregisters, which can occur
150  // with vectors of pointers.
151  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
152  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
153  return false;
154  }
155  }
156  return true;
157  }
158 
159  bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
160  for (unsigned i = 0; i < NumAddresses; ++i) {
161  const MachineOperand *AddrOp = AddrReg[i];
162  // Immediates are always OK.
163  if (AddrOp->isImm())
164  continue;
165 
166  // Don't try to merge addresses that aren't either immediates or registers.
167  // TODO: Should be possible to merge FrameIndexes and maybe some other
168 // non-register operands.
169  if (!AddrOp->isReg())
170  return false;
171 
172 // TODO: We should be able to merge physical reg addresses.
173  if (Register::isPhysicalRegister(AddrOp->getReg()))
174  return false;
175 
176 // If an address has only one use then there will be no other
177 // instructions with the same address, so we can't merge this one.
178  if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
179  return false;
180  }
181  return true;
182  }
183 
184  void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
185  const GCNSubtarget &STM);
186  void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
187  };
188 
189  struct BaseRegisters {
190  unsigned LoReg = 0;
191  unsigned HiReg = 0;
192 
193  unsigned LoSubReg = 0;
194  unsigned HiSubReg = 0;
195  };
196 
197  struct MemAddress {
198  BaseRegisters Base;
199  int64_t Offset = 0;
200  };
201 
202  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
203 
204 private:
205  const GCNSubtarget *STM = nullptr;
206  const SIInstrInfo *TII = nullptr;
207  const SIRegisterInfo *TRI = nullptr;
208  MachineRegisterInfo *MRI = nullptr;
209  AliasAnalysis *AA = nullptr;
210  bool OptimizeAgain;
211 
212  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
213  static bool offsetsCanBeCombined(CombineInfo &CI);
214  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
215  static unsigned getNewOpcode(const CombineInfo &CI);
216  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
217  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
218 
219  bool findMatchingInst(CombineInfo &CI);
220 
221  unsigned read2Opcode(unsigned EltSize) const;
222  unsigned read2ST64Opcode(unsigned EltSize) const;
223  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
224 
225  unsigned write2Opcode(unsigned EltSize) const;
226  unsigned write2ST64Opcode(unsigned EltSize) const;
227  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
228  MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
229  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
230  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
231  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
232 
233  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
234  int32_t NewOffset) const;
235  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
236  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
237  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
238  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
239 /// Promotes a constant offset into the immediate by adjusting the base. It
240 /// tries to use a base from nearby instructions that allows it to have
241 /// a 13-bit constant offset which then gets promoted into the immediate.
242  bool promoteConstantOffsetToImm(MachineInstr &CI,
243  MemInfoMap &Visited,
244  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
245  void addInstToMergeableList(const CombineInfo &CI,
246  std::list<std::list<CombineInfo> > &MergeableInsts) const;
247  bool collectMergeableInsts(MachineBasicBlock &MBB,
248  std::list<std::list<CombineInfo> > &MergeableInsts) const;
249 
250 public:
251  static char ID;
252 
253  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
254  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
255  }
256 
257  void removeCombinedInst(std::list<CombineInfo> &MergeList,
258  const MachineInstr &MI);
259  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
260  bool &OptimizeListAgain);
261  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
262 
263  bool runOnMachineFunction(MachineFunction &MF) override;
264 
265  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
266 
267  void getAnalysisUsage(AnalysisUsage &AU) const override {
268  AU.setPreservesCFG();
269  AU.addRequired<AAResultsWrapperPass>();
270 
271  MachineFunctionPass::getAnalysisUsage(AU);
272  }
273 };
274 
275 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
276  const unsigned Opc = MI.getOpcode();
277 
278  if (TII.isMUBUF(Opc)) {
279  // FIXME: Handle d16 correctly
280  return AMDGPU::getMUBUFElements(Opc);
281  }
282  if (TII.isMIMG(MI)) {
283  uint64_t DMaskImm =
284  TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
285  return countPopulation(DMaskImm);
286  }
287 
288  switch (Opc) {
289  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
290  return 1;
291  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
292  return 2;
293  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
294  return 4;
295  default:
296  return 0;
297  }
298 }
299 
300 /// Maps instruction opcode to enum InstClassEnum.
301 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
302  switch (Opc) {
303  default:
304  if (TII.isMUBUF(Opc)) {
305  switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
306  default:
307  return UNKNOWN;
308  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
309  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
310  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
311  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
312  return BUFFER_LOAD;
313  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
314  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
315  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
316  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
317  return BUFFER_STORE;
318  }
319  }
320  if (TII.isMIMG(Opc)) {
321  // Ignore instructions encoded without vaddr.
322  if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
323  return UNKNOWN;
324  // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
325  if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))
326  return UNKNOWN;
327  return MIMG;
328  }
329  return UNKNOWN;
330  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
331  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
332  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
333  return S_BUFFER_LOAD_IMM;
334  case AMDGPU::DS_READ_B32:
335  case AMDGPU::DS_READ_B32_gfx9:
336  case AMDGPU::DS_READ_B64:
337  case AMDGPU::DS_READ_B64_gfx9:
338  return DS_READ;
339  case AMDGPU::DS_WRITE_B32:
340  case AMDGPU::DS_WRITE_B32_gfx9:
341  case AMDGPU::DS_WRITE_B64:
342  case AMDGPU::DS_WRITE_B64_gfx9:
343  return DS_WRITE;
344  }
345 }
346 
347 /// Determines instruction subclass from opcode. Only instructions
348 /// of the same subclass can be merged together.
349 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
350  switch (Opc) {
351  default:
352  if (TII.isMUBUF(Opc))
353  return AMDGPU::getMUBUFBaseOpcode(Opc);
354  if (TII.isMIMG(Opc)) {
355  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
356  assert(Info);
357  return Info->BaseOpcode;
358  }
359  return -1;
360  case AMDGPU::DS_READ_B32:
361  case AMDGPU::DS_READ_B32_gfx9:
362  case AMDGPU::DS_READ_B64:
363  case AMDGPU::DS_READ_B64_gfx9:
364  case AMDGPU::DS_WRITE_B32:
365  case AMDGPU::DS_WRITE_B32_gfx9:
366  case AMDGPU::DS_WRITE_B64:
367  case AMDGPU::DS_WRITE_B64_gfx9:
368  return Opc;
369  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
370  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
371  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
372  return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
373  }
374 }
375 
376 static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
377  if (TII.isMUBUF(Opc)) {
378  unsigned result = 0;
379 
380  if (AMDGPU::getMUBUFHasVAddr(Opc)) {
381  result |= VADDR;
382  }
383 
384  if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
385  result |= SRSRC;
386  }
387 
388  if (AMDGPU::getMUBUFHasSoffset(Opc)) {
389  result |= SOFFSET;
390  }
391 
392  return result;
393  }
394 
395  if (TII.isMIMG(Opc)) {
396  unsigned result = VADDR | SRSRC;
397  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
398  if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
399  result |= SSAMP;
400  return result;
401  }
402 
403  switch (Opc) {
404  default:
405  return 0;
406  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
407  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
408  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
409  return SBASE;
410  case AMDGPU::DS_READ_B32:
411  case AMDGPU::DS_READ_B64:
412  case AMDGPU::DS_READ_B32_gfx9:
413  case AMDGPU::DS_READ_B64_gfx9:
414  case AMDGPU::DS_WRITE_B32:
415  case AMDGPU::DS_WRITE_B64:
416  case AMDGPU::DS_WRITE_B32_gfx9:
417  case AMDGPU::DS_WRITE_B64_gfx9:
418  return ADDR;
419  }
420 }
421 
422 
423 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
424  const SIInstrInfo &TII,
425  const GCNSubtarget &STM) {
426  I = MI;
427  unsigned Opc = MI->getOpcode();
428  InstClass = getInstClass(Opc, TII);
429 
430  if (InstClass == UNKNOWN)
431  return;
432 
433  switch (InstClass) {
434  case DS_READ:
435  EltSize =
436  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
437  : 4;
438  break;
439  case DS_WRITE:
440  EltSize =
441  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
442  : 4;
443  break;
444  case S_BUFFER_LOAD_IMM:
445  EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
446  break;
447  default:
448  EltSize = 4;
449  break;
450  }
451 
452  if (InstClass == MIMG) {
453  DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
454  } else {
455  int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
456  Offset0 = I->getOperand(OffsetIdx).getImm();
457  }
458 
459  Width0 = getOpcodeWidth(*I, TII);
460 
461  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
462  Offset0 &= 0xffff;
463  } else if (InstClass != MIMG) {
464  GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
465  if (InstClass != S_BUFFER_LOAD_IMM) {
466  SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
467  }
468  DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
469  }
470 
471  unsigned AddrOpName[5] = {0};
472  NumAddresses = 0;
473  const unsigned Regs = getRegs(I->getOpcode(), TII);
474 
475  if (Regs & ADDR) {
476  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
477  }
478 
479  if (Regs & SBASE) {
480  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
481  }
482 
483  if (Regs & SRSRC) {
484  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
485  }
486 
487  if (Regs & SOFFSET) {
488  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
489  }
490 
491  if (Regs & VADDR) {
492  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
493  }
494 
495  if (Regs & SSAMP) {
496  AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
497  }
498 
499  for (unsigned i = 0; i < NumAddresses; i++) {
500  AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
501  AddrReg[i] = &I->getOperand(AddrIdx[i]);
502  }
503 
504  InstsToMove.clear();
505 }
506 
507 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
508  const SIInstrInfo &TII) {
509  Paired = MI;
510  assert(InstClass == getInstClass(Paired->getOpcode(), TII));
511 
512  if (InstClass == MIMG) {
513  DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
514  } else {
515  int OffsetIdx =
516  AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
517  Offset1 = Paired->getOperand(OffsetIdx).getImm();
518  }
519 
520  Width1 = getOpcodeWidth(*Paired, TII);
521  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
522  Offset1 &= 0xffff;
523  } else if (InstClass != MIMG) {
524  GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
525  if (InstClass != S_BUFFER_LOAD_IMM) {
526  SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
527  }
528  DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
529  }
530 }
531 
532 
533 } // end anonymous namespace.
534 
535 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
536  "SI Load Store Optimizer", false, false)
537 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
538 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
539  false, false)
540 
541 char SILoadStoreOptimizer::ID = 0;
542 
543 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
544 
545 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
546  return new SILoadStoreOptimizer();
547 }
548 
549 static void moveInstsAfter(MachineBasicBlock::iterator I,
550  ArrayRef<MachineInstr *> InstsToMove) {
551  MachineBasicBlock *MBB = I->getParent();
552  ++I;
553  for (MachineInstr *MI : InstsToMove) {
554  MI->removeFromParent();
555  MBB->insert(I, MI);
556  }
557 }
558 
559 static void addDefsUsesToList(const MachineInstr &MI,
560  DenseSet<unsigned> &RegDefs,
561  DenseSet<unsigned> &PhysRegUses) {
562  for (const MachineOperand &Op : MI.operands()) {
563  if (Op.isReg()) {
564  if (Op.isDef())
565  RegDefs.insert(Op.getReg());
566  else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
567  PhysRegUses.insert(Op.getReg());
568  }
569  }
570 }
571 
572 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
573                                       MachineBasicBlock::iterator B,
574  AliasAnalysis *AA) {
575  // RAW or WAR - cannot reorder
576  // WAW - cannot reorder
577  // RAR - safe to reorder
578  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
579 }
580 
581 // Add MI and its defs to the lists if MI reads one of the defs that are
582 // already in the list. Returns true in that case.
583 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
584  DenseSet<unsigned> &PhysRegUses,
585                                   SmallVectorImpl<MachineInstr *> &Insts) {
586  for (MachineOperand &Use : MI.operands()) {
587  // If one of the defs is read, then there is a use of Def between I and the
588  // instruction that I will potentially be merged with. We will need to move
589  // this instruction after the merged instructions.
590  //
591  // Similarly, if there is a def which is read by an instruction that is to
592  // be moved for merging, then we need to move the def-instruction as well.
593  // This can only happen for physical registers such as M0; virtual
594  // registers are in SSA form.
595  if (Use.isReg() &&
596  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
597  (Use.isDef() && RegDefs.count(Use.getReg())) ||
598  (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
599  PhysRegUses.count(Use.getReg())))) {
600  Insts.push_back(&MI);
601  addDefsUsesToList(MI, RegDefs, PhysRegUses);
602  return true;
603  }
604  }
605 
606  return false;
607 }
608 
609 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
610  ArrayRef<MachineInstr *> InstsToMove,
611  AliasAnalysis *AA) {
612  assert(MemOp.mayLoadOrStore());
613 
614  for (MachineInstr *InstToMove : InstsToMove) {
615  if (!InstToMove->mayLoadOrStore())
616  continue;
617  if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
618  return false;
619  }
620  return true;
621 }
622 
623 // This function assumes that \p A and \p B are identical except for
624 // size and offset, and that they reference adjacent memory.
625 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
626  const MachineMemOperand *A,
627  const MachineMemOperand *B) {
628  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
629  unsigned Size = A->getSize() + B->getSize();
630  // This function adds the offset parameter to the existing offset for A,
631  // so we pass 0 here as the offset and then manually set it to the correct
632  // value after the call.
633  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
634  MMO->setOffset(MinOffset);
635  return MMO;
636 }
637 
638 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) {
639  assert(CI.InstClass == MIMG);
640 
641  // Ignore instructions with tfe/lwe set.
642  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
643  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
644 
645  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
646  return false;
647 
648  // Check other optional immediate operands for equality.
649  unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
650  AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
651  AMDGPU::OpName::da, AMDGPU::OpName::r128};
652 
653  for (auto op : OperandsToMatch) {
654  int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
655  if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx)
656  return false;
657  if (Idx != -1 &&
658  CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
659  return false;
660  }
661 
662  // Check DMask for overlaps.
663  unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
664  unsigned MinMask = std::min(CI.DMask0, CI.DMask1);
665 
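  // Every bit of the smaller mask must lie strictly below the lowest set bit
  // of the larger mask, e.g. MaxMask = 0b1100 allows MinMask = 0b0011 but
  // rejects MinMask = 0b0101.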
666  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
667  if ((1u << AllowedBitsForMin) <= MinMask)
668  return false;
669 
670  return true;
671 }
672 
673 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
674  assert(CI.InstClass != MIMG);
675 
676  // XXX - Would the same offset be OK? Is there any reason this would happen or
677  // be useful?
678  if (CI.Offset0 == CI.Offset1)
679  return false;
680 
681  // This won't be valid if the offset isn't aligned.
682  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
683  return false;
684 
685  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
686  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
687  CI.UseST64 = false;
688  CI.BaseOff = 0;
689 
690  // Handle SMEM and VMEM instructions.
691  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
692  return (EltOffset0 + CI.Width0 == EltOffset1 ||
693  EltOffset1 + CI.Width1 == EltOffset0) &&
694  CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
695  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
696  }
697 
698  // If the offset in elements doesn't fit in 8-bits, we might be able to use
699  // the stride 64 versions.
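  // For example, byte offsets 0 and 0x4000 with EltSize == 4 give element
  // offsets 0 and 4096; both are multiples of 64, so they are encoded as
  // offset0:0, offset1:64 in the ST64 form.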
700  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
701  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
702  CI.Offset0 = EltOffset0 / 64;
703  CI.Offset1 = EltOffset1 / 64;
704  CI.UseST64 = true;
705  return true;
706  }
707 
708  // Check if the new offsets fit in the reduced 8-bit range.
709  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
710  CI.Offset0 = EltOffset0;
711  CI.Offset1 = EltOffset1;
712  return true;
713  }
714 
715  // Try to shift base address to decrease offsets.
716  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
717  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
718 
719  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
720  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
721  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
722  CI.UseST64 = true;
723  return true;
724  }
725 
726  if (isUInt<8>(OffsetDiff)) {
727  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
728  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
729  return true;
730  }
731 
732  return false;
733 }
734 
735 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
736  const CombineInfo &CI) {
737  const unsigned Width = (CI.Width0 + CI.Width1);
738  switch (CI.InstClass) {
739  default:
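  // Default (non-SMEM) case: never merge to more than 4 dwords, and only
  // form a 3-dword (dwordx3) access if the subtarget supports it.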
740  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
741  case S_BUFFER_LOAD_IMM:
742  switch (Width) {
743  default:
744  return false;
745  case 2:
746  case 4:
747  return true;
748  }
749  }
750 }
751 
752 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
753  MachineBasicBlock *MBB = CI.I->getParent();
754  MachineBasicBlock::iterator E = MBB->end();
755  MachineBasicBlock::iterator MBBI = CI.I;
756 
757  const unsigned Opc = CI.I->getOpcode();
758  const InstClassEnum InstClass = getInstClass(Opc, *TII);
759 
760  if (InstClass == UNKNOWN) {
761  return false;
762  }
763  const unsigned InstSubclass = getInstSubclass(Opc, *TII);
764 
765  // Do not merge VMEM buffer instructions with "swizzled" bit set.
766  int Swizzled =
767  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
768  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
769  return false;
770 
771  ++MBBI;
772 
773  DenseSet<unsigned> RegDefsToMove;
774  DenseSet<unsigned> PhysRegUsesToMove;
775  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
776 
777  for (; MBBI != E; ++MBBI) {
778 
779  if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
780  (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
781  // This is not a matching instruction, but we can keep looking as
782 // long as one of these conditions is met:
783  // 1. It is safe to move I down past MBBI.
784  // 2. It is safe to move MBBI down past the instruction that I will
785  // be merged into.
786 
787  if (MBBI->hasUnmodeledSideEffects()) {
788  // We can't re-order this instruction with respect to other memory
789  // operations, so we fail both conditions mentioned above.
790  return false;
791  }
792 
793  if (MBBI->mayLoadOrStore() &&
794  (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
795  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
796  // We fail condition #1, but we may still be able to satisfy condition
797  // #2. Add this instruction to the move list and then we will check
798  // if condition #2 holds once we have selected the matching instruction.
799  CI.InstsToMove.push_back(&*MBBI);
800  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
801  continue;
802  }
803 
804 // When we match I with another DS instruction we will be moving I down
805 // to the location of the matched instruction, so any uses of I will need
806 // to be moved down as well.
807  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
808  CI.InstsToMove);
809  continue;
810  }
811 
812  // Don't merge volatiles.
813  if (MBBI->hasOrderedMemoryRef())
814  return false;
815 
816  // Handle a case like
817  // DS_WRITE_B32 addr, v, idx0
818  // w = DS_READ_B32 addr, idx0
819  // DS_WRITE_B32 addr, f(w), idx1
820  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
821  // merging of the two writes.
822  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
823  CI.InstsToMove))
824  continue;
825 
826  bool Match = CI.hasSameBaseAddress(*MBBI);
827 
828  if (Match) {
829  CI.setPaired(MBBI, *TII);
830 
831  // Check both offsets (or masks for MIMG) can be combined and fit in the
832  // reduced range.
833  bool canBeCombined =
834  CI.InstClass == MIMG
835  ? dmasksCanBeCombined(CI, *TII)
836  : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
837 
838  // We also need to go through the list of instructions that we plan to
839  // move and make sure they are all safe to move down past the merged
840  // instruction.
841  if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
842  return true;
843  }
844 
845  // We've found a load/store that we couldn't merge for some reason.
846  // We could potentially keep looking, but we'd need to make sure that
847 // it was safe to move I and also all the instructions in InstsToMove
848 // down past this instruction.
849 // Check if we can move I across MBBI and if we can move all of I's users.
850  if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
851  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
852  break;
853  }
854  return false;
855 }
856 
857 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
858  if (STM->ldsRequiresM0Init())
859  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
860  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
861 }
862 
863 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
864  if (STM->ldsRequiresM0Init())
865  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
866 
867  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
868  : AMDGPU::DS_READ2ST64_B64_gfx9;
869 }
870 
871 MachineBasicBlock::iterator
872 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
873  MachineBasicBlock *MBB = CI.I->getParent();
874 
875  // Be careful, since the addresses could be subregisters themselves in weird
876  // cases, like vectors of pointers.
877  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
878 
879  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
880  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
881 
882  unsigned NewOffset0 = CI.Offset0;
883  unsigned NewOffset1 = CI.Offset1;
884  unsigned Opc =
885  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
886 
887  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
888  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
889 
890  if (NewOffset0 > NewOffset1) {
891  // Canonicalize the merged instruction so the smaller offset comes first.
892  std::swap(NewOffset0, NewOffset1);
893  std::swap(SubRegIdx0, SubRegIdx1);
894  }
895 
896  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
897  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
898 
899  const MCInstrDesc &Read2Desc = TII->get(Opc);
900 
901  const TargetRegisterClass *SuperRC =
902  (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
903  Register DestReg = MRI->createVirtualRegister(SuperRC);
904 
905  DebugLoc DL = CI.I->getDebugLoc();
906 
907  Register BaseReg = AddrReg->getReg();
908  unsigned BaseSubReg = AddrReg->getSubReg();
909  unsigned BaseRegFlags = 0;
910  if (CI.BaseOff) {
911  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
912  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
913  .addImm(CI.BaseOff);
914 
915  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
916  BaseRegFlags = RegState::Kill;
917 
918  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
919  .addReg(ImmReg)
920  .addReg(AddrReg->getReg(), 0, BaseSubReg)
921  .addImm(0); // clamp bit
922  BaseSubReg = 0;
923  }
924 
925  MachineInstrBuilder Read2 =
926  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
927  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
928  .addImm(NewOffset0) // offset0
929  .addImm(NewOffset1) // offset1
930  .addImm(0) // gds
931  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
932 
933  (void)Read2;
934 
935  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
936 
937  // Copy to the old destination registers.
938  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
939  .add(*Dest0) // Copy to same destination including flags and sub reg.
940  .addReg(DestReg, 0, SubRegIdx0);
941  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
942  .add(*Dest1)
943  .addReg(DestReg, RegState::Kill, SubRegIdx1);
944 
945  moveInstsAfter(Copy1, CI.InstsToMove);
946 
947  CI.I->eraseFromParent();
948  CI.Paired->eraseFromParent();
949 
950  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
951  return Read2;
952 }
953 
954 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
955  if (STM->ldsRequiresM0Init())
956  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
957  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
958  : AMDGPU::DS_WRITE2_B64_gfx9;
959 }
960 
961 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
962  if (STM->ldsRequiresM0Init())
963  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
964  : AMDGPU::DS_WRITE2ST64_B64;
965 
966  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
967  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
968 }
969 
970 MachineBasicBlock::iterator
971 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
972  MachineBasicBlock *MBB = CI.I->getParent();
973 
974 // Be sure to use .add(), and not .addReg(), with these. We want to be
975 // sure we preserve the subregister index and any register flags set on them.
976  const MachineOperand *AddrReg =
977  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
978  const MachineOperand *Data0 =
979  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
980  const MachineOperand *Data1 =
981  TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
982 
983  unsigned NewOffset0 = CI.Offset0;
984  unsigned NewOffset1 = CI.Offset1;
985  unsigned Opc =
986  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
987 
988  if (NewOffset0 > NewOffset1) {
989  // Canonicalize the merged instruction so the smaller offset comes first.
990  std::swap(NewOffset0, NewOffset1);
991  std::swap(Data0, Data1);
992  }
993 
994  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
995  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
996 
997  const MCInstrDesc &Write2Desc = TII->get(Opc);
998  DebugLoc DL = CI.I->getDebugLoc();
999 
1000  Register BaseReg = AddrReg->getReg();
1001  unsigned BaseSubReg = AddrReg->getSubReg();
1002  unsigned BaseRegFlags = 0;
1003  if (CI.BaseOff) {
1004  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1005  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1006  .addImm(CI.BaseOff);
1007 
1008  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1009  BaseRegFlags = RegState::Kill;
1010 
1011  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
1012  .addReg(ImmReg)
1013  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1014  .addImm(0); // clamp bit
1015  BaseSubReg = 0;
1016  }
1017 
1018  MachineInstrBuilder Write2 =
1019  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
1020  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1021  .add(*Data0) // data0
1022  .add(*Data1) // data1
1023  .addImm(NewOffset0) // offset0
1024  .addImm(NewOffset1) // offset1
1025  .addImm(0) // gds
1026  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1027 
1028  moveInstsAfter(Write2, CI.InstsToMove);
1029 
1030  CI.I->eraseFromParent();
1031  CI.Paired->eraseFromParent();
1032 
1033  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1034  return Write2;
1035 }
1036 
1037 MachineBasicBlock::iterator
1038 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
1039  MachineBasicBlock *MBB = CI.I->getParent();
1040  DebugLoc DL = CI.I->getDebugLoc();
1041  const unsigned Opcode = getNewOpcode(CI);
1042 
1043  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1044 
1045  Register DestReg = MRI->createVirtualRegister(SuperRC);
1046  unsigned MergedDMask = CI.DMask0 | CI.DMask1;
1047  unsigned DMaskIdx =
1048  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1049 
1050  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1051  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1052  if (I == DMaskIdx)
1053  MIB.addImm(MergedDMask);
1054  else
1055  MIB.add((*CI.I).getOperand(I));
1056  }
1057 
1058  // It shouldn't be possible to get this far if the two instructions
1059  // don't have a single memoperand, because MachineInstr::mayAlias()
1060  // will return true if this is the case.
1061  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1062 
1063  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1064  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1065 
1066  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1067 
1068  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1069  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1070  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1071 
1072  // Copy to the old destination registers.
1073  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1074  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1075  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1076 
1077  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1078  .add(*Dest0) // Copy to same destination including flags and sub reg.
1079  .addReg(DestReg, 0, SubRegIdx0);
1080  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1081  .add(*Dest1)
1082  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1083 
1084  moveInstsAfter(Copy1, CI.InstsToMove);
1085 
1086  CI.I->eraseFromParent();
1087  CI.Paired->eraseFromParent();
1088  return New;
1089 }
1090 
1091 MachineBasicBlock::iterator
1092 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
1093  MachineBasicBlock *MBB = CI.I->getParent();
1094  DebugLoc DL = CI.I->getDebugLoc();
1095  const unsigned Opcode = getNewOpcode(CI);
1096 
1097  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1098 
1099  Register DestReg = MRI->createVirtualRegister(SuperRC);
1100  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1101 
1102  // It shouldn't be possible to get this far if the two instructions
1103  // don't have a single memoperand, because MachineInstr::mayAlias()
1104  // will return true if this is the case.
1105  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1106 
1107  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1108  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1109 
1110  MachineInstr *New =
1111  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
1112  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1113  .addImm(MergedOffset) // offset
1114  .addImm(CI.GLC0) // glc
1115  .addImm(CI.DLC0) // dlc
1116  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1117 
1118  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1119  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1120  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1121 
1122  // Copy to the old destination registers.
1123  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1124  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1125  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
1126 
1127  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1128  .add(*Dest0) // Copy to same destination including flags and sub reg.
1129  .addReg(DestReg, 0, SubRegIdx0);
1130  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1131  .add(*Dest1)
1132  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1133 
1134  moveInstsAfter(Copy1, CI.InstsToMove);
1135 
1136  CI.I->eraseFromParent();
1137  CI.Paired->eraseFromParent();
1138  return New;
1139 }
1140 
1141 MachineBasicBlock::iterator
1142 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
1143  MachineBasicBlock *MBB = CI.I->getParent();
1144  DebugLoc DL = CI.I->getDebugLoc();
1145 
1146  const unsigned Opcode = getNewOpcode(CI);
1147 
1148  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1149 
1150  // Copy to the new source register.
1151  Register DestReg = MRI->createVirtualRegister(SuperRC);
1152  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1153 
1154  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1155 
1156  const unsigned Regs = getRegs(Opcode, *TII);
1157 
1158  if (Regs & VADDR)
1159  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1160 
1161  // It shouldn't be possible to get this far if the two instructions
1162  // don't have a single memoperand, because MachineInstr::mayAlias()
1163  // will return true if this is the case.
1164  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1165 
1166  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1167  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1168 
1169  MachineInstr *New =
1170  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1171  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1172  .addImm(MergedOffset) // offset
1173  .addImm(CI.GLC0) // glc
1174  .addImm(CI.SLC0) // slc
1175  .addImm(0) // tfe
1176  .addImm(CI.DLC0) // dlc
1177  .addImm(0) // swz
1178  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1179 
1180  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1181  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1182  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1183 
1184  // Copy to the old destination registers.
1185  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1186  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1187  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1188 
1189  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1190  .add(*Dest0) // Copy to same destination including flags and sub reg.
1191  .addReg(DestReg, 0, SubRegIdx0);
1192  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1193  .add(*Dest1)
1194  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1195 
1196  moveInstsAfter(Copy1, CI.InstsToMove);
1197 
1198  CI.I->eraseFromParent();
1199  CI.Paired->eraseFromParent();
1200  return New;
1201 }
1202 
1203 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
1204  const unsigned Width = CI.Width0 + CI.Width1;
1205 
1206  switch (CI.InstClass) {
1207  default:
1208  assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1209  // FIXME: Handle d16 correctly
1210  return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1211  Width);
1212  case UNKNOWN:
1213  llvm_unreachable("Unknown instruction class");
1214  case S_BUFFER_LOAD_IMM:
1215  switch (Width) {
1216  default:
1217  return 0;
1218  case 2:
1219  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1220  case 4:
1221  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1222  }
1223  case MIMG:
1224  assert((countPopulation(CI.DMask0 | CI.DMask1) == Width) && "No overlaps");
1225  return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1226  }
1227 }
1228 
1229 std::pair<unsigned, unsigned>
1230 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
1231 
1232  if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
1233  return std::make_pair(0, 0);
1234 
1235  bool ReverseOrder;
1236  if (CI.InstClass == MIMG) {
1237  assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
1238  "No overlaps");
1239  ReverseOrder = CI.DMask0 > CI.DMask1;
1240  } else
1241  ReverseOrder = CI.Offset0 > CI.Offset1;
1242 
1243  static const unsigned Idxs[4][4] = {
1244  {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1245  {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1246  {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1247  {AMDGPU::sub3, 0, 0, 0},
1248  };
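  // For example, merging a 2-dword access with a following 1-dword access
  // (no reordering) selects Idx0 = sub0_sub1 and Idx1 = sub2.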
1249  unsigned Idx0;
1250  unsigned Idx1;
1251 
1252  assert(CI.Width0 >= 1 && CI.Width0 <= 3);
1253  assert(CI.Width1 >= 1 && CI.Width1 <= 3);
1254 
1255  if (ReverseOrder) {
1256  Idx1 = Idxs[0][CI.Width1 - 1];
1257  Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
1258  } else {
1259  Idx0 = Idxs[0][CI.Width0 - 1];
1260  Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
1261  }
1262 
1263  return std::make_pair(Idx0, Idx1);
1264 }
1265 
1266 const TargetRegisterClass *
1267 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1268  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1269  switch (CI.Width0 + CI.Width1) {
1270  default:
1271  return nullptr;
1272  case 2:
1273  return &AMDGPU::SReg_64_XEXECRegClass;
1274  case 4:
1275  return &AMDGPU::SGPR_128RegClass;
1276  case 8:
1277  return &AMDGPU::SReg_256RegClass;
1278  case 16:
1279  return &AMDGPU::SReg_512RegClass;
1280  }
1281  } else {
1282  switch (CI.Width0 + CI.Width1) {
1283  default:
1284  return nullptr;
1285  case 2:
1286  return &AMDGPU::VReg_64RegClass;
1287  case 3:
1288  return &AMDGPU::VReg_96RegClass;
1289  case 4:
1290  return &AMDGPU::VReg_128RegClass;
1291  }
1292  }
1293 }
1294 
1295 MachineBasicBlock::iterator
1296 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1297  MachineBasicBlock *MBB = CI.I->getParent();
1298  DebugLoc DL = CI.I->getDebugLoc();
1299 
1300  const unsigned Opcode = getNewOpcode(CI);
1301 
1302  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1303  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1304  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1305 
1306  // Copy to the new source register.
1307  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1308  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1309 
1310  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1311  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1312 
1313  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1314  .add(*Src0)
1315  .addImm(SubRegIdx0)
1316  .add(*Src1)
1317  .addImm(SubRegIdx1);
1318 
1319  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1320  .addReg(SrcReg, RegState::Kill);
1321 
1322  const unsigned Regs = getRegs(Opcode, *TII);
1323 
1324  if (Regs & VADDR)
1325  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1326 
1327 
1328  // It shouldn't be possible to get this far if the two instructions
1329  // don't have a single memoperand, because MachineInstr::mayAlias()
1330  // will return true if this is the case.
1331  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1332 
1333  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1334  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1335 
1336  MachineInstr *New =
1337  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1338  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1339  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1340  .addImm(CI.GLC0) // glc
1341  .addImm(CI.SLC0) // slc
1342  .addImm(0) // tfe
1343  .addImm(CI.DLC0) // dlc
1344  .addImm(0) // swz
1345  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1346 
1347  moveInstsAfter(MIB, CI.InstsToMove);
1348 
1349  CI.I->eraseFromParent();
1350  CI.Paired->eraseFromParent();
1351  return New;
1352 }
1353 
1354 MachineOperand
1355 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1356  APInt V(32, Val, true);
1357  if (TII->isInlineConstant(V))
1358  return MachineOperand::CreateImm(Val);
1359 
1360  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1361  MachineInstr *Mov =
1362  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1363  TII->get(AMDGPU::S_MOV_B32), Reg)
1364  .addImm(Val);
1365  (void)Mov;
1366  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1367  return MachineOperand::CreateReg(Reg, false);
1368 }
1369 
1370 // Compute base address using Addr and return the final register.
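// The 64-bit add is split into V_ADD_I32_e64 (low half, defining the carry)
// and V_ADDC_U32_e64 (high half, consuming the carry); the two halves are
// then recombined into a 64-bit register with REG_SEQUENCE.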
1371 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1372  const MemAddress &Addr) const {
1373  MachineBasicBlock *MBB = MI.getParent();
1374  MachineBasicBlock::iterator MBBI = MI.getIterator();
1375  DebugLoc DL = MI.getDebugLoc();
1376 
1377  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1378  Addr.Base.LoSubReg) &&
1379  "Expected 32-bit Base-Register-Low!!");
1380 
1381  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1382  Addr.Base.HiSubReg) &&
1383  "Expected 32-bit Base-Register-Hi!!");
1384 
1385  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1386  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1387  MachineOperand OffsetHi =
1388  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1389 
1390  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1391  Register CarryReg = MRI->createVirtualRegister(CarryRC);
1392  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1393 
1394  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1395  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1396  MachineInstr *LoHalf =
1397  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1398  .addReg(CarryReg, RegState::Define)
1399  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1400  .add(OffsetLo)
1401  .addImm(0); // clamp bit
1402  (void)LoHalf;
1403  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1404 
1405  MachineInstr *HiHalf =
1406  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1407  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1408  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1409  .add(OffsetHi)
1410  .addReg(CarryReg, RegState::Kill)
1411  .addImm(0); // clamp bit
1412  (void)HiHalf;
1413  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1414 
1415  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1416  MachineInstr *FullBase =
1417  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1418  .addReg(DestSub0)
1419  .addImm(AMDGPU::sub0)
1420  .addReg(DestSub1)
1421  .addImm(AMDGPU::sub1);
1422  (void)FullBase;
1423  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1424 
1425  return FullDestReg;
1426 }
1427 
1428 // Update base and offset with the NewBase and NewOffset in MI.
1429 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1430  unsigned NewBase,
1431  int32_t NewOffset) const {
1432  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1433  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1434 }
1435 
1436 Optional<int32_t>
1437 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1438  if (Op.isImm())
1439  return Op.getImm();
1440 
1441  if (!Op.isReg())
1442  return None;
1443 
1444  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1445  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1446  !Def->getOperand(1).isImm())
1447  return None;
1448 
1449  return Def->getOperand(1).getImm();
1450 }
1451 
1452 // Analyzes Base and extracts:
1453 // - 32-bit base registers and subregisters
1454 // - a 64-bit constant offset
1455 // Expecting base computation as:
1456 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1457 // %LO:vgpr_32, %c:sreg_64_xexec =
1458 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1459 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1460 // %Base:vreg_64 =
1461 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1462 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1463  MemAddress &Addr) const {
1464  if (!Base.isReg())
1465  return;
1466 
1467  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1468  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1469  || Def->getNumOperands() != 5)
1470  return;
1471 
1472  MachineOperand BaseLo = Def->getOperand(1);
1473  MachineOperand BaseHi = Def->getOperand(3);
1474  if (!BaseLo.isReg() || !BaseHi.isReg())
1475  return;
1476 
1477  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1478  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1479 
1480  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1481  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1482  return;
1483 
1484  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1485  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1486 
1487  auto Offset0P = extractConstOffset(*Src0);
1488  if (Offset0P)
1489  BaseLo = *Src1;
1490  else {
1491  if (!(Offset0P = extractConstOffset(*Src1)))
1492  return;
1493  BaseLo = *Src0;
1494  }
1495 
1496  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1497  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1498 
1499  if (Src0->isImm())
1500  std::swap(Src0, Src1);
1501 
1502  if (!Src1->isImm())
1503  return;
1504 
1505  uint64_t Offset1 = Src1->getImm();
1506  BaseHi = *Src0;
1507 
1508  Addr.Base.LoReg = BaseLo.getReg();
1509  Addr.Base.HiReg = BaseHi.getReg();
1510  Addr.Base.LoSubReg = BaseLo.getSubReg();
1511  Addr.Base.HiSubReg = BaseHi.getSubReg();
1512  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1513 }
1514 
1515 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1516  MachineInstr &MI,
1517  MemInfoMap &Visited,
1518  SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1519 
1520  if (!(MI.mayLoad() ^ MI.mayStore()))
1521  return false;
1522 
1523  // TODO: Support flat and scratch.
1524  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1525  return false;
1526 
1527  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1528  return false;
1529 
1530  if (AnchorList.count(&MI))
1531  return false;
1532 
1533  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1534 
1535  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1536  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1537  return false;
1538  }
1539 
1540  // Step1: Find the base-registers and a 64bit constant offset.
1541  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1542  MemAddress MAddr;
1543  if (Visited.find(&MI) == Visited.end()) {
1544  processBaseWithConstOffset(Base, MAddr);
1545  Visited[&MI] = MAddr;
1546  } else
1547  MAddr = Visited[&MI];
1548 
1549  if (MAddr.Offset == 0) {
1550  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1551  " constant offsets that can be promoted.\n";);
1552  return false;
1553  }
1554 
1555  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1556  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1557 
1558  // Step2: Traverse through MI's basic block and find an anchor (one that has
1559  // the same base registers) with the highest 13-bit distance from MI's offset.
1560  // E.g. (64bit loads)
1561  // bb:
1562  // addr1 = &a + 4096; load1 = load(addr1, 0)
1563  // addr2 = &a + 6144; load2 = load(addr2, 0)
1564  // addr3 = &a + 8192; load3 = load(addr3, 0)
1565  // addr4 = &a + 10240; load4 = load(addr4, 0)
1566  // addr5 = &a + 12288; load5 = load(addr5, 0)
1567  //
1568  // Starting from the first load, the optimization will try to find a new base
1569  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1570  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192
1571  // as the new base (anchor) because the maximum distance can presumably
1572  // accommodate more intermediate bases.
1573  //
1574  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1575  // (&a + 8192) for load1, load2, load4.
1576  // addr = &a + 8192
1577  // load1 = load(addr, -4096)
1578  // load2 = load(addr, -2048)
1579  // load3 = load(addr, 0)
1580  // load4 = load(addr, 2048)
1581  // addr5 = &a + 12288; load5 = load(addr5, 0)
1582  //
1583  MachineInstr *AnchorInst = nullptr;
1584  MemAddress AnchorAddr;
1585  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1586  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1587 
1588  MachineBasicBlock *MBB = MI.getParent();
1589  MachineBasicBlock::iterator E = MBB->end();
1590  MachineBasicBlock::iterator MBBI = MI.getIterator();
1591  ++MBBI;
1592  const SITargetLowering *TLI =
1593  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1594 
1595  for ( ; MBBI != E; ++MBBI) {
1596  MachineInstr &MINext = *MBBI;
1597  // TODO: Support finding an anchor(with same base) from store addresses or
1598  // any other load addresses where the opcodes are different.
1599  if (MINext.getOpcode() != MI.getOpcode() ||
1600  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1601  continue;
1602 
1603  const MachineOperand &BaseNext =
1604  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1605  MemAddress MAddrNext;
1606  if (Visited.find(&MINext) == Visited.end()) {
1607  processBaseWithConstOffset(BaseNext, MAddrNext);
1608  Visited[&MINext] = MAddrNext;
1609  } else
1610  MAddrNext = Visited[&MINext];
1611 
1612  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1613  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1614  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1615  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1616  continue;
1617 
1618  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1619 
1620  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1621  TargetLoweringBase::AddrMode AM;
1622  AM.HasBaseReg = true;
1623  AM.BaseOffs = Dist;
1624  if (TLI->isLegalGlobalAddressingMode(AM) &&
1625  (uint32_t)std::abs(Dist) > MaxDist) {
1626  MaxDist = std::abs(Dist);
1627 
1628  AnchorAddr = MAddrNext;
1629  AnchorInst = &MINext;
1630  }
1631  }
1632 
1633  if (AnchorInst) {
1634  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1635  AnchorInst->dump());
1636  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1637  << AnchorAddr.Offset << "\n\n");
1638 
1639  // Instead of moving up, just re-compute anchor-instruction's base address.
1640  unsigned Base = computeBase(MI, AnchorAddr);
1641 
1642  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1643  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1644 
1645  for (auto P : InstsWCommonBase) {
1646  TargetLoweringBase::AddrMode AM;
1647  AM.HasBaseReg = true;
1648  AM.BaseOffs = P.second - AnchorAddr.Offset;
1649 
1650  if (TLI->isLegalGlobalAddressingMode(AM)) {
1651  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1652  dbgs() << ")"; P.first->dump());
1653  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1654  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1655  }
1656  }
1657  AnchorList.insert(AnchorInst);
1658  return true;
1659  }
1660 
1661  return false;
1662 }
1663 
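 // Append CI to the list in MergeableInsts that shares its base address and
 // instruction class; if none exists yet, start a new single-element list.
 // Illustrative grouping (register names hypothetical):
 //   list 0: BUFFER_LOAD via %vaddr0, offsets 0 and 4
 //   list 1: DS_READ     via %addr1,  offsets 16 and 32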
1664 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1665  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1666  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1667  if (AddrList.front().hasSameBaseAddress(*CI.I) &&
1668  AddrList.front().InstClass == CI.InstClass) {
1669  AddrList.emplace_back(CI);
1670  return;
1671  }
1672  }
1673 
1674  // Base address not found, so add a new list.
1675  MergeableInsts.emplace_back(1, CI);
1676 }
1677 
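 // Walk MBB, first giving every instruction a chance to have its constant
 // offset promoted into the immediate field, then bucketing the memory
 // operations this pass understands into per-base-address lists for merging.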
1678 bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
1679  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1680  bool Modified = false;
1681  // Caches the base-address info computed for instructions already visited.
1682  MemInfoMap Visited;
1683  // Contains the list of instructions for which constant offsets are being
1684  // promoted to the IMM.
1685  SmallPtrSet<MachineInstr *, 4> AnchorList;
1686 
1687  // Sort potentially mergeable instructions into lists, one per base address.
1688  for (MachineInstr &MI : MBB.instrs()) {
1689  // We run this before checking if an address is mergeable, because it can produce
1690  // better code even if the instructions aren't mergeable.
1691  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1692  Modified = true;
1693 
1694  const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1695  if (InstClass == UNKNOWN)
1696  continue;
1697 
1698  // Don't combine if the memory reference is volatile or otherwise ordered.
1699  if (MI.hasOrderedMemoryRef())
1700  continue;
1701 
1702  CombineInfo CI;
1703  CI.setMI(MI, *TII, *STM);
1704 
1705  if (!CI.hasMergeableAddress(*MRI))
1706  continue;
1707 
1708  addInstToMergeableList(CI, MergeableInsts);
1709  }
1710  return Modified;
1711 }
1712 
1713 // Scan the collected lists looking for adjacent memory operations (DS, SMEM,
1714 // buffer, MIMG) with constant offsets from the same base register. We rely on
1715 // the scheduler to cluster nearby loads, and assume these are all adjacent.
1716 bool SILoadStoreOptimizer::optimizeBlock(
1717  std::list<std::list<CombineInfo> > &MergeableInsts) {
1718  bool Modified = false;
1719 
1720  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
1721  if (MergeList.size() < 2)
1722  continue;
1723 
1724  bool OptimizeListAgain = false;
1725  if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
1726  // We weren't able to make any changes, so clear the list so we don't
1727  // process the same instructions the next time we try to optimize this
1728  // block.
1729  MergeList.clear();
1730  continue;
1731  }
1732 
1733  // We made changes, but also determined that there were no more optimization
1734  // opportunities, so we don't need to reprocess the list.
1735  if (!OptimizeListAgain)
1736  MergeList.clear();
1737 
1738  OptimizeAgain |= OptimizeListAgain;
1739  Modified = true;
1740  }
1741  return Modified;
1742 }
1743 
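 // Drop from MergeList the entry whose instruction is MI, i.e. the paired
 // instruction that has just been folded into a merged access.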
1744 void
1745 SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
1746  const MachineInstr &MI) {
1747 
1748  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
1749  if (&*CI->I == &MI) {
1750  MergeList.erase(CI);
1751  return;
1752  }
1753  }
1754 }
1755 
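 // Try to pair and merge the instructions of a single same-base-address list.
 // A successful merge replaces CI with the new, wider instruction, and
 // OptimizeListAgain is set whenever the result is still narrow enough that
 // another round of merging might widen it further.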
1756 bool
1757 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
1758  std::list<CombineInfo> &MergeList,
1759  bool &OptimizeListAgain) {
1760  bool Modified = false;
1761  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
1762  CombineInfo &CI = *I;
1763 
1764  switch (CI.InstClass) {
1765  default:
1766  break;
1767  case DS_READ:
1768  if (findMatchingInst(CI)) {
1769  Modified = true;
1770  removeCombinedInst(MergeList, *CI.Paired);
1771  MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
1772  CI.setMI(NewMI, *TII, *STM);
1773  }
1774  break;
1775  case DS_WRITE:
1776  if (findMatchingInst(CI)) {
1777  Modified = true;
1778  removeCombinedInst(MergeList, *CI.Paired);
1779  MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
1780  CI.setMI(NewMI, *TII, *STM);
1781  }
1782  break;
1783  case S_BUFFER_LOAD_IMM:
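      // Widths here count dwords; an s_buffer_load result can keep merging
      // until it reaches the dwordx16 form, so anything narrower is retried.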
1784  if (findMatchingInst(CI)) {
1785  Modified = true;
1786  removeCombinedInst(MergeList, *CI.Paired);
1787  MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
1788  CI.setMI(NewMI, *TII, *STM);
1789  OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
1790  }
1791  break;
1792  case BUFFER_LOAD:
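      // Buffer loads top out at dwordx4; the same cap applies to the
      // buffer-store and image cases below.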
1793  if (findMatchingInst(CI)) {
1794  Modified = true;
1795  removeCombinedInst(MergeList, *CI.Paired);
1796  MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
1797  CI.setMI(NewMI, *TII, *STM);
1798  OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1799  }
1800  break;
1801  case BUFFER_STORE:
1802  if (findMatchingInst(CI)) {
1803  Modified = true;
1804  removeCombinedInst(MergeList, *CI.Paired);
1805  MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
1806  CI.setMI(NewMI, *TII, *STM);
1807  OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1808  }
1809  break;
1810  case MIMG:
1811  if (findMatchingInst(CI)) {
1812  Modified = true;
1813  removeCombinedInst(MergeList, *CI.Paired);
1814  MachineBasicBlock::iterator NewMI = mergeImagePair(CI);
1815  CI.setMI(NewMI, *TII, *STM);
1816  OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1817  }
1818  break;
1819  }
1820  // Clear the InstsToMove after we have finished searching so we don't have
1821  // stale values left over if we search for this CI again in another pass
1822  // over the block.
1823  CI.InstsToMove.clear();
1824  }
1825 
1826  return Modified;
1827 }
1828 
1829 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1830  if (skipFunction(MF.getFunction()))
1831  return false;
1832 
1833  STM = &MF.getSubtarget<GCNSubtarget>();
1834  if (!STM->loadStoreOptEnabled())
1835  return false;
1836 
1837  TII = STM->getInstrInfo();
1838  TRI = &TII->getRegisterInfo();
1839 
1840  MRI = &MF.getRegInfo();
1841  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1842 
1843  assert(MRI->isSSA() && "Must be run on SSA");
1844 
1845  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1846 
1847  bool Modified = false;
1848 
1849 
1850  for (MachineBasicBlock &MBB : MF) {
1851  std::list<std::list<CombineInfo> > MergeableInsts;
1852  // First pass: Collect list of all instructions we know how to merge.
1853  Modified |= collectMergeableInsts(MBB, MergeableInsts);
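 // Second pass: merge within each list, repeating while a merge might enable
 // another (e.g. two dwordx2 results later combining into a dwordx4).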
1854  do {
1855  OptimizeAgain = false;
1856  Modified |= optimizeBlock(MergeableInsts);
1857  } while (OptimizeAgain);
1858  }
1859 
1860  return Modified;
1861 }