SILoadStoreOptimizer.cpp
1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
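// (Note: the merged read2/write2 offsets are in units of the element size,
// 4 bytes here, which is why byte offsets 16 and 32 become offset0:4 and
// offset1:8.)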
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset into the immediate field
23// by adjusting the base. It tries to use a base from a nearby instruction that
24// leaves a 13-bit constant offset, and then promotes that 13-bit offset into
25// the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This currently misses stores of constants because the load of the
46// constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset field, but are close enough together, we can add to the base
56// pointer and use the new reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
62#include "llvm/Analysis/AliasAnalysis.h"
63#include "llvm/CodeGen/MachineFunctionPass.h"
64#include "llvm/InitializePasses.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 S_BUFFER_LOAD_SGPR_IMM,
78 S_LOAD_IMM,
79 BUFFER_LOAD,
80 BUFFER_STORE,
81 MIMG,
82 TBUFFER_LOAD,
83 TBUFFER_STORE,
84 GLOBAL_LOAD_SADDR,
85 GLOBAL_STORE_SADDR,
86 FLAT_LOAD,
87 FLAT_STORE,
88 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89 GLOBAL_STORE // any CombineInfo, they are only ever returned by
90 // getCommonInstClass.
91};
92
93struct AddressRegs {
94 unsigned char NumVAddrs = 0;
95 bool SBase = false;
96 bool SRsrc = false;
97 bool SOffset = false;
98 bool SAddr = false;
99 bool VAddr = false;
100 bool Addr = false;
101 bool SSamp = false;
102};
103
104// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105const unsigned MaxAddressRegs = 12 + 1 + 1;
106
107class SILoadStoreOptimizer : public MachineFunctionPass {
108 struct CombineInfo {
109 MachineBasicBlock::iterator I;
110 unsigned EltSize;
111 unsigned Offset;
112 unsigned Width;
113 unsigned Format;
114 unsigned BaseOff;
115 unsigned DMask;
116 InstClassEnum InstClass;
117 unsigned CPol = 0;
118 bool IsAGPR;
119 bool UseST64;
120 int AddrIdx[MaxAddressRegs];
121 const MachineOperand *AddrReg[MaxAddressRegs];
122 unsigned NumAddresses;
123 unsigned Order;
124
125 bool hasSameBaseAddress(const CombineInfo &CI) {
126 if (NumAddresses != CI.NumAddresses)
127 return false;
128
129 const MachineInstr &MI = *CI.I;
130 for (unsigned i = 0; i < NumAddresses; i++) {
131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132
133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136 return false;
137 }
138 continue;
139 }
140
141 // Check same base pointer. Be careful of subregisters, which can occur
142 // with vectors of pointers.
143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145 return false;
146 }
147 }
148 return true;
149 }
150
151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152 for (unsigned i = 0; i < NumAddresses; ++i) {
153 const MachineOperand *AddrOp = AddrReg[i];
154 // Immediates are always OK.
155 if (AddrOp->isImm())
156 continue;
157
158 // Don't try to merge addresses that aren't either immediates or registers.
159 // TODO: Should be possible to merge FrameIndexes and maybe some other
160 // non-register operands.
161 if (!AddrOp->isReg())
162 return false;
163
164 // TODO: We should be able to merge instructions with other physical reg
165 // addresses too.
166 if (AddrOp->getReg().isPhysical() &&
167 AddrOp->getReg() != AMDGPU::SGPR_NULL)
168 return false;
169
170 // If an address has only one use then there will be no other
171 // instructions with the same address, so we can't merge this one.
172 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173 return false;
174 }
175 return true;
176 }
177
178 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179
180 // Compare by pointer order.
181 bool operator<(const CombineInfo& Other) const {
182 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183 }
184 };
185
186 struct BaseRegisters {
187 Register LoReg;
188 Register HiReg;
189
190 unsigned LoSubReg = 0;
191 unsigned HiSubReg = 0;
192 };
193
194 struct MemAddress {
195 BaseRegisters Base;
196 int64_t Offset = 0;
197 };
198
199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200
201private:
202 const GCNSubtarget *STM = nullptr;
203 const SIInstrInfo *TII = nullptr;
204 const SIRegisterInfo *TRI = nullptr;
205 MachineRegisterInfo *MRI = nullptr;
206 AliasAnalysis *AA = nullptr;
207 bool OptimizeAgain;
208
209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210 const DenseSet<Register> &ARegUses,
211 const MachineInstr &A, const MachineInstr &B) const;
212 static bool dmasksCanBeCombined(const CombineInfo &CI,
213 const SIInstrInfo &TII,
214 const CombineInfo &Paired);
215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216 CombineInfo &Paired, bool Modify = false);
217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218 const CombineInfo &Paired);
219 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221 const CombineInfo &Paired);
222 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223 const CombineInfo &Paired);
224 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
225
226 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
227
228 unsigned read2Opcode(unsigned EltSize) const;
229 unsigned read2ST64Opcode(unsigned EltSize) const;
230 MachineBasicBlock::iterator
231 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
232 MachineBasicBlock::iterator InsertBefore);
233
234 unsigned write2Opcode(unsigned EltSize) const;
235 unsigned write2ST64Opcode(unsigned EltSize) const;
236 MachineBasicBlock::iterator
237 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238 MachineBasicBlock::iterator InsertBefore);
239 MachineBasicBlock::iterator
240 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241 MachineBasicBlock::iterator InsertBefore);
242 MachineBasicBlock::iterator
243 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244 MachineBasicBlock::iterator InsertBefore);
245 MachineBasicBlock::iterator
246 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247 MachineBasicBlock::iterator InsertBefore);
248 MachineBasicBlock::iterator
249 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250 MachineBasicBlock::iterator InsertBefore);
251 MachineBasicBlock::iterator
252 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253 MachineBasicBlock::iterator InsertBefore);
254 MachineBasicBlock::iterator
255 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263
264 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
265 int32_t NewOffset) const;
266 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
267 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
269 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
270 /// Promotes a constant offset into the immediate by adjusting the base. It
271 /// tries to use a base from a nearby instruction that leaves a 13-bit
272 /// constant offset, which then gets promoted into the immediate.
273 bool promoteConstantOffsetToImm(MachineInstr &CI,
274 MemInfoMap &Visited,
275 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
276 void addInstToMergeableList(const CombineInfo &CI,
277 std::list<std::list<CombineInfo> > &MergeableInsts) const;
278
279 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
280 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
281 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
282 std::list<std::list<CombineInfo>> &MergeableInsts) const;
283
284 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
285 const CombineInfo &Paired);
286
287 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
288 const CombineInfo &Paired);
289
290public:
291 static char ID;
292
293 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
294 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
295 }
296
297 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
298 bool &OptimizeListAgain);
299 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
300
301 bool runOnMachineFunction(MachineFunction &MF) override;
302
303 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304
305 void getAnalysisUsage(AnalysisUsage &AU) const override {
306 AU.setPreservesCFG();
307 AU.addRequired<AAResultsWrapperPass>();
308
309 MachineFunctionPass::getAnalysisUsage(AU);
310 }
311
312 MachineFunctionProperties getRequiredProperties() const override {
313 return MachineFunctionProperties()
314 .set(MachineFunctionProperties::Property::IsSSA);
315 }
316};
317
318static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
319 const unsigned Opc = MI.getOpcode();
320
321 if (TII.isMUBUF(Opc)) {
322 // FIXME: Handle d16 correctly
323 return AMDGPU::getMUBUFElements(Opc);
324 }
325 if (TII.isImage(MI)) {
326 uint64_t DMaskImm =
327 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328 return llvm::popcount(DMaskImm);
329 }
330 if (TII.isMTBUF(Opc)) {
331 return AMDGPU::getMTBUFElements(Opc);
332 }
333
334 switch (Opc) {
335 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337 case AMDGPU::S_LOAD_DWORD_IMM:
338 case AMDGPU::GLOBAL_LOAD_DWORD:
339 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
340 case AMDGPU::GLOBAL_STORE_DWORD:
341 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
342 case AMDGPU::FLAT_LOAD_DWORD:
343 case AMDGPU::FLAT_STORE_DWORD:
344 return 1;
345 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347 case AMDGPU::S_LOAD_DWORDX2_IMM:
348 case AMDGPU::GLOBAL_LOAD_DWORDX2:
349 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350 case AMDGPU::GLOBAL_STORE_DWORDX2:
351 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352 case AMDGPU::FLAT_LOAD_DWORDX2:
353 case AMDGPU::FLAT_STORE_DWORDX2:
354 return 2;
355 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357 case AMDGPU::S_LOAD_DWORDX3_IMM:
358 case AMDGPU::GLOBAL_LOAD_DWORDX3:
359 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
360 case AMDGPU::GLOBAL_STORE_DWORDX3:
361 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
362 case AMDGPU::FLAT_LOAD_DWORDX3:
363 case AMDGPU::FLAT_STORE_DWORDX3:
364 return 3;
365 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367 case AMDGPU::S_LOAD_DWORDX4_IMM:
368 case AMDGPU::GLOBAL_LOAD_DWORDX4:
369 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
370 case AMDGPU::GLOBAL_STORE_DWORDX4:
371 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
372 case AMDGPU::FLAT_LOAD_DWORDX4:
373 case AMDGPU::FLAT_STORE_DWORDX4:
374 return 4;
375 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377 case AMDGPU::S_LOAD_DWORDX8_IMM:
378 return 8;
379 case AMDGPU::DS_READ_B32: [[fallthrough]];
380 case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
381 case AMDGPU::DS_WRITE_B32: [[fallthrough]];
382 case AMDGPU::DS_WRITE_B32_gfx9:
383 return 1;
384 case AMDGPU::DS_READ_B64: [[fallthrough]];
385 case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
386 case AMDGPU::DS_WRITE_B64: [[fallthrough]];
387 case AMDGPU::DS_WRITE_B64_gfx9:
388 return 2;
389 default:
390 return 0;
391 }
392}
393
394/// Maps instruction opcode to enum InstClassEnum.
395static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
396 switch (Opc) {
397 default:
398 if (TII.isMUBUF(Opc)) {
399 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
400 default:
401 return UNKNOWN;
402 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
403 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
404 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
405 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
406 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
407 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
408 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
409 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
410 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
411 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
412 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
413 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
414 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
415 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
416 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
417 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
418 return BUFFER_LOAD;
419 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
420 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
421 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
422 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
423 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
424 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
425 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
426 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
427 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
428 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
429 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
430 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
431 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
432 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
433 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
434 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
435 return BUFFER_STORE;
436 }
437 }
438 if (TII.isImage(Opc)) {
439 // Ignore instructions encoded without vaddr.
440 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
441 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
442 return UNKNOWN;
443 // Ignore BVH instructions
444 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
445 return UNKNOWN;
446 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
447 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
448 TII.isGather4(Opc))
449 return UNKNOWN;
450 return MIMG;
451 }
452 if (TII.isMTBUF(Opc)) {
453 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
454 default:
455 return UNKNOWN;
456 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
457 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
458 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
459 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
460 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
461 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
462 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
463 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
464 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
465 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
466 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
467 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
468 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
469 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
470 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
471 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
472 return TBUFFER_LOAD;
473 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
474 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
475 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
476 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
477 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
478 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
479 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
480 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
481 return TBUFFER_STORE;
482 }
483 }
484 return UNKNOWN;
485 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
486 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
487 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
488 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
489 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
490 return S_BUFFER_LOAD_IMM;
491 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
492 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
493 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
494 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
495 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
496 return S_BUFFER_LOAD_SGPR_IMM;
497 case AMDGPU::S_LOAD_DWORD_IMM:
498 case AMDGPU::S_LOAD_DWORDX2_IMM:
499 case AMDGPU::S_LOAD_DWORDX3_IMM:
500 case AMDGPU::S_LOAD_DWORDX4_IMM:
501 case AMDGPU::S_LOAD_DWORDX8_IMM:
502 return S_LOAD_IMM;
503 case AMDGPU::DS_READ_B32:
504 case AMDGPU::DS_READ_B32_gfx9:
505 case AMDGPU::DS_READ_B64:
506 case AMDGPU::DS_READ_B64_gfx9:
507 return DS_READ;
508 case AMDGPU::DS_WRITE_B32:
509 case AMDGPU::DS_WRITE_B32_gfx9:
510 case AMDGPU::DS_WRITE_B64:
511 case AMDGPU::DS_WRITE_B64_gfx9:
512 return DS_WRITE;
513 case AMDGPU::GLOBAL_LOAD_DWORD:
514 case AMDGPU::GLOBAL_LOAD_DWORDX2:
515 case AMDGPU::GLOBAL_LOAD_DWORDX3:
516 case AMDGPU::GLOBAL_LOAD_DWORDX4:
517 case AMDGPU::FLAT_LOAD_DWORD:
518 case AMDGPU::FLAT_LOAD_DWORDX2:
519 case AMDGPU::FLAT_LOAD_DWORDX3:
520 case AMDGPU::FLAT_LOAD_DWORDX4:
521 return FLAT_LOAD;
522 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
523 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
524 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
525 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
526 return GLOBAL_LOAD_SADDR;
527 case AMDGPU::GLOBAL_STORE_DWORD:
528 case AMDGPU::GLOBAL_STORE_DWORDX2:
529 case AMDGPU::GLOBAL_STORE_DWORDX3:
530 case AMDGPU::GLOBAL_STORE_DWORDX4:
531 case AMDGPU::FLAT_STORE_DWORD:
532 case AMDGPU::FLAT_STORE_DWORDX2:
533 case AMDGPU::FLAT_STORE_DWORDX3:
534 case AMDGPU::FLAT_STORE_DWORDX4:
535 return FLAT_STORE;
536 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
537 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
538 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
539 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
540 return GLOBAL_STORE_SADDR;
541 }
542}
543
544/// Determines instruction subclass from opcode. Only instructions
545/// of the same subclass can be merged together. The merged instruction may have
546/// a different subclass but must have the same class.
547static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
548 switch (Opc) {
549 default:
550 if (TII.isMUBUF(Opc))
551 return AMDGPU::getMUBUFBaseOpcode(Opc);
552 if (TII.isImage(Opc)) {
553 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
554 assert(Info);
555 return Info->BaseOpcode;
556 }
557 if (TII.isMTBUF(Opc))
558 return AMDGPU::getMTBUFBaseOpcode(Opc);
559 return -1;
560 case AMDGPU::DS_READ_B32:
561 case AMDGPU::DS_READ_B32_gfx9:
562 case AMDGPU::DS_READ_B64:
563 case AMDGPU::DS_READ_B64_gfx9:
564 case AMDGPU::DS_WRITE_B32:
565 case AMDGPU::DS_WRITE_B32_gfx9:
566 case AMDGPU::DS_WRITE_B64:
567 case AMDGPU::DS_WRITE_B64_gfx9:
568 return Opc;
569 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
570 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
571 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
572 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
573 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
574 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
575 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
576 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
577 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
578 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
579 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
580 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
581 case AMDGPU::S_LOAD_DWORD_IMM:
582 case AMDGPU::S_LOAD_DWORDX2_IMM:
583 case AMDGPU::S_LOAD_DWORDX3_IMM:
584 case AMDGPU::S_LOAD_DWORDX4_IMM:
585 case AMDGPU::S_LOAD_DWORDX8_IMM:
586 return AMDGPU::S_LOAD_DWORD_IMM;
587 case AMDGPU::GLOBAL_LOAD_DWORD:
588 case AMDGPU::GLOBAL_LOAD_DWORDX2:
589 case AMDGPU::GLOBAL_LOAD_DWORDX3:
590 case AMDGPU::GLOBAL_LOAD_DWORDX4:
591 case AMDGPU::FLAT_LOAD_DWORD:
592 case AMDGPU::FLAT_LOAD_DWORDX2:
593 case AMDGPU::FLAT_LOAD_DWORDX3:
594 case AMDGPU::FLAT_LOAD_DWORDX4:
595 return AMDGPU::FLAT_LOAD_DWORD;
596 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
597 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
598 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
599 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
600 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
601 case AMDGPU::GLOBAL_STORE_DWORD:
602 case AMDGPU::GLOBAL_STORE_DWORDX2:
603 case AMDGPU::GLOBAL_STORE_DWORDX3:
604 case AMDGPU::GLOBAL_STORE_DWORDX4:
605 case AMDGPU::FLAT_STORE_DWORD:
606 case AMDGPU::FLAT_STORE_DWORDX2:
607 case AMDGPU::FLAT_STORE_DWORDX3:
608 case AMDGPU::FLAT_STORE_DWORDX4:
609 return AMDGPU::FLAT_STORE_DWORD;
610 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
611 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
612 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
613 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
614 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
615 }
616}
617
618// GLOBAL loads and stores are classified as FLAT initially. If both combined
619// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
620// If either or both instructions are non-segment-specific FLAT, the resulting
621// combined operation will be FLAT, potentially promoting one of the GLOBAL
622// operations to FLAT.
623// For other instructions return the original unmodified class.
624InstClassEnum
625SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
626 const CombineInfo &Paired) {
627 assert(CI.InstClass == Paired.InstClass);
628
629 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
630 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
631 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
632
633 return CI.InstClass;
634}
635
636static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
637 AddressRegs Result;
638
639 if (TII.isMUBUF(Opc)) {
640 if (AMDGPU::getMUBUFHasVAddr(Opc))
641 Result.VAddr = true;
642 if (AMDGPU::getMUBUFHasSrsrc(Opc))
643 Result.SRsrc = true;
644 if (AMDGPU::getMUBUFHasSoffset(Opc))
645 Result.SOffset = true;
646
647 return Result;
648 }
649
650 if (TII.isImage(Opc)) {
651 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
652 if (VAddr0Idx >= 0) {
653 int RsrcName =
654 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
655 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
656 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
657 } else {
658 Result.VAddr = true;
659 }
660 Result.SRsrc = true;
661 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
662 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
663 Result.SSamp = true;
664
665 return Result;
666 }
667 if (TII.isMTBUF(Opc)) {
668 if (AMDGPU::getMTBUFHasVAddr(Opc))
669 Result.VAddr = true;
670 if (AMDGPU::getMTBUFHasSrsrc(Opc))
671 Result.SRsrc = true;
672 if (AMDGPU::getMTBUFHasSoffset(Opc))
673 Result.SOffset = true;
674
675 return Result;
676 }
677
678 switch (Opc) {
679 default:
680 return Result;
681 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
682 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
683 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
684 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
685 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
686 Result.SOffset = true;
687 [[fallthrough]];
688 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
689 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
690 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
691 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
692 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
693 case AMDGPU::S_LOAD_DWORD_IMM:
694 case AMDGPU::S_LOAD_DWORDX2_IMM:
695 case AMDGPU::S_LOAD_DWORDX3_IMM:
696 case AMDGPU::S_LOAD_DWORDX4_IMM:
697 case AMDGPU::S_LOAD_DWORDX8_IMM:
698 Result.SBase = true;
699 return Result;
700 case AMDGPU::DS_READ_B32:
701 case AMDGPU::DS_READ_B64:
702 case AMDGPU::DS_READ_B32_gfx9:
703 case AMDGPU::DS_READ_B64_gfx9:
704 case AMDGPU::DS_WRITE_B32:
705 case AMDGPU::DS_WRITE_B64:
706 case AMDGPU::DS_WRITE_B32_gfx9:
707 case AMDGPU::DS_WRITE_B64_gfx9:
708 Result.Addr = true;
709 return Result;
710 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
711 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
712 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
713 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
714 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
715 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
716 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
717 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
718 Result.SAddr = true;
719 [[fallthrough]];
720 case AMDGPU::GLOBAL_LOAD_DWORD:
721 case AMDGPU::GLOBAL_LOAD_DWORDX2:
722 case AMDGPU::GLOBAL_LOAD_DWORDX3:
723 case AMDGPU::GLOBAL_LOAD_DWORDX4:
724 case AMDGPU::GLOBAL_STORE_DWORD:
725 case AMDGPU::GLOBAL_STORE_DWORDX2:
726 case AMDGPU::GLOBAL_STORE_DWORDX3:
727 case AMDGPU::GLOBAL_STORE_DWORDX4:
728 case AMDGPU::FLAT_LOAD_DWORD:
729 case AMDGPU::FLAT_LOAD_DWORDX2:
730 case AMDGPU::FLAT_LOAD_DWORDX3:
731 case AMDGPU::FLAT_LOAD_DWORDX4:
732 case AMDGPU::FLAT_STORE_DWORD:
733 case AMDGPU::FLAT_STORE_DWORDX2:
734 case AMDGPU::FLAT_STORE_DWORDX3:
735 case AMDGPU::FLAT_STORE_DWORDX4:
736 Result.VAddr = true;
737 return Result;
738 }
739}
740
741void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
742 const SILoadStoreOptimizer &LSO) {
743 I = MI;
744 unsigned Opc = MI->getOpcode();
745 InstClass = getInstClass(Opc, *LSO.TII);
746
747 if (InstClass == UNKNOWN)
748 return;
749
750 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
751
752 switch (InstClass) {
753 case DS_READ:
754 EltSize =
755 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
756 : 4;
757 break;
758 case DS_WRITE:
759 EltSize =
760 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
761 : 4;
762 break;
763 case S_BUFFER_LOAD_IMM:
764 case S_BUFFER_LOAD_SGPR_IMM:
765 case S_LOAD_IMM:
766 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
767 break;
768 default:
769 EltSize = 4;
770 break;
771 }
772
773 if (InstClass == MIMG) {
774 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
775 // Offset is not considered for MIMG instructions.
776 Offset = 0;
777 } else {
778 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
779 Offset = I->getOperand(OffsetIdx).getImm();
780 }
781
782 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
783 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
784
785 Width = getOpcodeWidth(*I, *LSO.TII);
786
787 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
788 Offset &= 0xffff;
789 } else if (InstClass != MIMG) {
790 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
791 }
792
793 AddressRegs Regs = getRegs(Opc, *LSO.TII);
794 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
795
796 NumAddresses = 0;
797 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
798 AddrIdx[NumAddresses++] =
799 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
800 if (Regs.Addr)
801 AddrIdx[NumAddresses++] =
802 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
803 if (Regs.SBase)
804 AddrIdx[NumAddresses++] =
805 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
806 if (Regs.SRsrc)
807 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
808 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
809 if (Regs.SOffset)
810 AddrIdx[NumAddresses++] =
811 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
812 if (Regs.SAddr)
813 AddrIdx[NumAddresses++] =
814 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
815 if (Regs.VAddr)
816 AddrIdx[NumAddresses++] =
817 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
818 if (Regs.SSamp)
819 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
820 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
821 assert(NumAddresses <= MaxAddressRegs);
822
823 for (unsigned J = 0; J < NumAddresses; J++)
824 AddrReg[J] = &I->getOperand(AddrIdx[J]);
825}
826
827} // end anonymous namespace.
828
829INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
830 "SI Load Store Optimizer", false, false)
831INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
832INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
833 false, false)
834
835char SILoadStoreOptimizer::ID = 0;
836
837char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
838
839FunctionPass *llvm::createSILoadStoreOptimizerPass() {
840 return new SILoadStoreOptimizer();
841}
842
843static void addDefsUsesToList(const MachineInstr &MI,
844 DenseSet<Register> &RegDefs,
845 DenseSet<Register> &RegUses) {
846 for (const auto &Op : MI.operands()) {
847 if (!Op.isReg())
848 continue;
849 if (Op.isDef())
850 RegDefs.insert(Op.getReg());
851 if (Op.readsReg())
852 RegUses.insert(Op.getReg());
853 }
854}
855
856bool SILoadStoreOptimizer::canSwapInstructions(
857 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
858 const MachineInstr &A, const MachineInstr &B) const {
859 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
860 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
861 return false;
862 for (const auto &BOp : B.operands()) {
863 if (!BOp.isReg())
864 continue;
865 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
866 return false;
867 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
868 return false;
869 }
870 return true;
871}
872
873// Given that \p CI and \p Paired are adjacent memory operations, produce a new
874// MMO for the combined operation with a new access size.
875MachineMemOperand *
876SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
877 const CombineInfo &Paired) {
878 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
879 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
880
881 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
882
883 // A base pointer for the combined operation is the same as the leading
884 // operation's pointer.
885 if (Paired < CI)
886 std::swap(MMOa, MMOb);
887
888 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
889 // If merging FLAT and GLOBAL set address space to FLAT.
890 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
891 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
892
893 MachineFunction *MF = CI.I->getMF();
894 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
895}
896
897bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
898 const SIInstrInfo &TII,
899 const CombineInfo &Paired) {
900 assert(CI.InstClass == MIMG);
901
902 // Ignore instructions with tfe/lwe set.
903 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
904 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
905
906 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
907 return false;
908
909 // Check other optional immediate operands for equality.
910 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
911 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
912 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
913
914 for (auto op : OperandsToMatch) {
915 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
916 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
917 return false;
918 if (Idx != -1 &&
919 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
920 return false;
921 }
922
923 // Check DMask for overlaps.
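// For example (illustrative dmask values, not from a real shader): 0x3 and 0xc
// can be combined because 0x3 lies entirely below the lowest set bit of 0xc,
// whereas 0x5 and 0x2 cannot because their components interleave.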
924 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
925 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
926
927 if (!MaxMask)
928 return false;
929
930 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
931 if ((1u << AllowedBitsForMin) <= MinMask)
932 return false;
933
934 return true;
935}
936
937static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
938 unsigned ComponentCount,
939 const GCNSubtarget &STI) {
940 if (ComponentCount > 4)
941 return 0;
942
943 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
944 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
945 if (!OldFormatInfo)
946 return 0;
947
948 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
949 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
950 ComponentCount,
951 OldFormatInfo->NumFormat, STI);
952
953 if (!NewFormatInfo)
954 return 0;
955
956 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
957 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
958
959 return NewFormatInfo->Format;
960}
961
962// Return the value in the inclusive range [Lo,Hi] that is aligned to the
963// highest power of two. Note that the result is well defined for all inputs
964// including corner cases like:
965// - if Lo == Hi, return that value
966// - if Lo == 0, return 0 (even though the "- 1" below underflows)
967// - if Lo > Hi, return 0 (as if the range wrapped around)
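// For example (illustrative values): mostAlignedValueInRange(0x81, 0x17f)
// returns 0x100, the only value in that range that is aligned to 256.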
968static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
969 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
970}
971
972bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
973 const GCNSubtarget &STI,
974 CombineInfo &Paired,
975 bool Modify) {
976 assert(CI.InstClass != MIMG);
977
978 // XXX - Would the same offset be OK? Is there any reason this would happen or
979 // be useful?
980 if (CI.Offset == Paired.Offset)
981 return false;
982
983 // This won't be valid if the offset isn't aligned.
984 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
985 return false;
986
987 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
988
989 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
990 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
991 if (!Info0)
992 return false;
993 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
994 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
995 if (!Info1)
996 return false;
997
998 if (Info0->BitsPerComp != Info1->BitsPerComp ||
999 Info0->NumFormat != Info1->NumFormat)
1000 return false;
1001
1002 // TODO: Should be possible to support more formats, but if format loads
1003 // are not dword-aligned, the merged load might not be valid.
1004 if (Info0->BitsPerComp != 32)
1005 return false;
1006
1007 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1008 return false;
1009 }
1010
1011 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1012 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1013 CI.UseST64 = false;
1014 CI.BaseOff = 0;
1015
1016 // Handle all non-DS instructions.
1017 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1018 if (EltOffset0 + CI.Width != EltOffset1 &&
1019 EltOffset1 + Paired.Width != EltOffset0)
1020 return false;
1021 if (CI.CPol != Paired.CPol)
1022 return false;
1023 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1024 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1025 // Reject cases like:
1026 // dword + dwordx2 -> dwordx3
1027 // dword + dwordx3 -> dwordx4
1028 // If we tried to combine these cases, we would fail to extract a subreg
1029 // for the result of the second load due to SGPR alignment requirements.
1030 if (CI.Width != Paired.Width &&
1031 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1032 return false;
1033 }
1034 return true;
1035 }
1036
1037 // If the offset in elements doesn't fit in 8-bits, we might be able to use
1038 // the stride 64 versions.
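// For example (illustrative values): two ds_read_b32 at byte offsets 0x4000 and
// 0x4100 have element offsets 4096 and 4160, both multiples of 64, and
// 4096/64 = 64 and 4160/64 = 65 fit in 8 bits, so the ST64 form can be used.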
1039 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1040 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1041 if (Modify) {
1042 CI.Offset = EltOffset0 / 64;
1043 Paired.Offset = EltOffset1 / 64;
1044 CI.UseST64 = true;
1045 }
1046 return true;
1047 }
1048
1049 // Check if the new offsets fit in the reduced 8-bit range.
1050 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1051 if (Modify) {
1052 CI.Offset = EltOffset0;
1053 Paired.Offset = EltOffset1;
1054 }
1055 return true;
1056 }
1057
1058 // Try to shift base address to decrease offsets.
1059 uint32_t Min = std::min(EltOffset0, EltOffset1);
1060 uint32_t Max = std::max(EltOffset0, EltOffset1);
1061
1062 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
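// (Mask == 0x3fc0, so the two element offsets may differ only by a multiple of
// 64 that fits in the 8-bit stride-64 offset field.)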
1063 if (((Max - Min) & ~Mask) == 0) {
1064 if (Modify) {
1065 // From the range of values we could use for BaseOff, choose the one that
1066 // is aligned to the highest power of two, to maximise the chance that
1067 // the same offset can be reused for other load/store pairs.
1068 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1069 // Copy the low bits of the offsets, so that when we adjust them by
1070 // subtracting BaseOff they will be multiples of 64.
1071 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1072 CI.BaseOff = BaseOff * CI.EltSize;
1073 CI.Offset = (EltOffset0 - BaseOff) / 64;
1074 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1075 CI.UseST64 = true;
1076 }
1077 return true;
1078 }
1079
1080 if (isUInt<8>(Max - Min)) {
1081 if (Modify) {
1082 // From the range of values we could use for BaseOff, choose the one that
1083 // is aligned to the highest power of two, to maximise the chance that
1084 // the same offset can be reused for other load/store pairs.
1085 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1086 CI.BaseOff = BaseOff * CI.EltSize;
1087 CI.Offset = EltOffset0 - BaseOff;
1088 Paired.Offset = EltOffset1 - BaseOff;
1089 }
1090 return true;
1091 }
1092
1093 return false;
1094}
1095
1096bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1097 const CombineInfo &CI,
1098 const CombineInfo &Paired) {
1099 const unsigned Width = (CI.Width + Paired.Width);
1100 switch (CI.InstClass) {
1101 default:
1102 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1103 case S_BUFFER_LOAD_IMM:
1104 case S_BUFFER_LOAD_SGPR_IMM:
1105 case S_LOAD_IMM:
1106 switch (Width) {
1107 default:
1108 return false;
1109 case 2:
1110 case 4:
1111 case 8:
1112 return true;
1113 case 3:
1114 return STM.hasScalarDwordx3Loads();
1115 }
1116 }
1117}
1118
1119const TargetRegisterClass *
1120SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1121 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1122 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1123 }
1124 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1125 return TRI->getRegClassForReg(*MRI, Src->getReg());
1126 }
1127 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1128 return TRI->getRegClassForReg(*MRI, Src->getReg());
1129 }
1130 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1131 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1132 }
1133 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1134 return TRI->getRegClassForReg(*MRI, Src->getReg());
1135 }
1136 return nullptr;
1137}
1138
1139/// This function assumes that CI comes before Paired in a basic block. Return
1140/// an insertion point for the merged instruction or nullptr on failure.
1141SILoadStoreOptimizer::CombineInfo *
1142SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1143 CombineInfo &Paired) {
1144 // If another instruction has already been merged into CI, it may now be a
1145 // type that we can't do any further merging into.
1146 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1147 return nullptr;
1148 assert(CI.InstClass == Paired.InstClass);
1149
1150 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1151 getInstSubclass(Paired.I->getOpcode(), *TII))
1152 return nullptr;
1153
1154 // Check both offsets (or masks for MIMG) can be combined and fit in the
1155 // reduced range.
1156 if (CI.InstClass == MIMG) {
1157 if (!dmasksCanBeCombined(CI, *TII, Paired))
1158 return nullptr;
1159 } else {
1160 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1161 return nullptr;
1162 }
1163
1164 DenseSet<Register> RegDefs;
1165 DenseSet<Register> RegUses;
1166 CombineInfo *Where;
1167 if (CI.I->mayLoad()) {
1168 // Try to hoist Paired up to CI.
1169 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1170 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1171 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1172 return nullptr;
1173 }
1174 Where = &CI;
1175 } else {
1176 // Try to sink CI down to Paired.
1177 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1178 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1179 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1180 return nullptr;
1181 }
1182 Where = &Paired;
1183 }
1184
1185 // Call offsetsCanBeCombined with modify = true so that the offsets are
1186 // correct for the new instruction. This should return true, because
1187 // this function should only be called on CombineInfo objects that
1188 // have already been confirmed to be mergeable.
1189 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1190 offsetsCanBeCombined(CI, *STM, Paired, true);
1191 return Where;
1192}
1193
1194unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1195 if (STM->ldsRequiresM0Init())
1196 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1197 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1198}
1199
1200unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1201 if (STM->ldsRequiresM0Init())
1202 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1203
1204 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1205 : AMDGPU::DS_READ2ST64_B64_gfx9;
1206}
1207
1208MachineBasicBlock::iterator
1209SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1210 MachineBasicBlock::iterator InsertBefore) {
1211 MachineBasicBlock *MBB = CI.I->getParent();
1212
1213 // Be careful, since the addresses could be subregisters themselves in weird
1214 // cases, like vectors of pointers.
1215 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1216
1217 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1218 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1219
1220 unsigned NewOffset0 = CI.Offset;
1221 unsigned NewOffset1 = Paired.Offset;
1222 unsigned Opc =
1223 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1224
1225 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1226 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1227
1228 if (NewOffset0 > NewOffset1) {
1229 // Canonicalize the merged instruction so the smaller offset comes first.
1230 std::swap(NewOffset0, NewOffset1);
1231 std::swap(SubRegIdx0, SubRegIdx1);
1232 }
1233
1234 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1235 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1236
1237 const MCInstrDesc &Read2Desc = TII->get(Opc);
1238
1239 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1240 Register DestReg = MRI->createVirtualRegister(SuperRC);
1241
1242 DebugLoc DL = CI.I->getDebugLoc();
1243
1244 Register BaseReg = AddrReg->getReg();
1245 unsigned BaseSubReg = AddrReg->getSubReg();
1246 unsigned BaseRegFlags = 0;
1247 if (CI.BaseOff) {
1248 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1249 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1250 .addImm(CI.BaseOff);
1251
1252 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1253 BaseRegFlags = RegState::Kill;
1254
1255 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1256 .addReg(ImmReg)
1257 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1258 .addImm(0); // clamp bit
1259 BaseSubReg = 0;
1260 }
1261
1262 MachineInstrBuilder Read2 =
1263 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1264 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1265 .addImm(NewOffset0) // offset0
1266 .addImm(NewOffset1) // offset1
1267 .addImm(0) // gds
1268 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1269
1270 (void)Read2;
1271
1272 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1273
1274 // Copy to the old destination registers.
1275 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1276 .add(*Dest0) // Copy to same destination including flags and sub reg.
1277 .addReg(DestReg, 0, SubRegIdx0);
1278 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1279 .add(*Dest1)
1280 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1281
1282 CI.I->eraseFromParent();
1283 Paired.I->eraseFromParent();
1284
1285 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1286 return Read2;
1287}
1288
1289unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1290 if (STM->ldsRequiresM0Init())
1291 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1292 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1293 : AMDGPU::DS_WRITE2_B64_gfx9;
1294}
1295
1296unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1297 if (STM->ldsRequiresM0Init())
1298 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1299 : AMDGPU::DS_WRITE2ST64_B64;
1300
1301 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1302 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1303}
1304
1305MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1306 CombineInfo &CI, CombineInfo &Paired,
1307 MachineBasicBlock::iterator InsertBefore) {
1308 MachineBasicBlock *MBB = CI.I->getParent();
1309
1310 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1311 // sure we preserve the subregister index and any register flags set on them.
1312 const MachineOperand *AddrReg =
1313 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1314 const MachineOperand *Data0 =
1315 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1316 const MachineOperand *Data1 =
1317 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1318
1319 unsigned NewOffset0 = CI.Offset;
1320 unsigned NewOffset1 = Paired.Offset;
1321 unsigned Opc =
1322 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1323
1324 if (NewOffset0 > NewOffset1) {
1325 // Canonicalize the merged instruction so the smaller offset comes first.
1326 std::swap(NewOffset0, NewOffset1);
1327 std::swap(Data0, Data1);
1328 }
1329
1330 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1331 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1332
1333 const MCInstrDesc &Write2Desc = TII->get(Opc);
1334 DebugLoc DL = CI.I->getDebugLoc();
1335
1336 Register BaseReg = AddrReg->getReg();
1337 unsigned BaseSubReg = AddrReg->getSubReg();
1338 unsigned BaseRegFlags = 0;
1339 if (CI.BaseOff) {
1340 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1341 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1342 .addImm(CI.BaseOff);
1343
1344 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1345 BaseRegFlags = RegState::Kill;
1346
1347 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1348 .addReg(ImmReg)
1349 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1350 .addImm(0); // clamp bit
1351 BaseSubReg = 0;
1352 }
1353
1354 MachineInstrBuilder Write2 =
1355 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1356 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1357 .add(*Data0) // data0
1358 .add(*Data1) // data1
1359 .addImm(NewOffset0) // offset0
1360 .addImm(NewOffset1) // offset1
1361 .addImm(0) // gds
1362 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1363
1364 CI.I->eraseFromParent();
1365 Paired.I->eraseFromParent();
1366
1367 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1368 return Write2;
1369}
1370
1371MachineBasicBlock::iterator
1372SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1373 MachineBasicBlock::iterator InsertBefore) {
1374 MachineBasicBlock *MBB = CI.I->getParent();
1375 DebugLoc DL = CI.I->getDebugLoc();
1376 const unsigned Opcode = getNewOpcode(CI, Paired);
1377
1378 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1379
1380 Register DestReg = MRI->createVirtualRegister(SuperRC);
1381 unsigned MergedDMask = CI.DMask | Paired.DMask;
1382 unsigned DMaskIdx =
1383 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1384
1385 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1386 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1387 if (I == DMaskIdx)
1388 MIB.addImm(MergedDMask);
1389 else
1390 MIB.add((*CI.I).getOperand(I));
1391 }
1392
1393 // It shouldn't be possible to get this far if the two instructions
1394 // don't have a single memoperand, because MachineInstr::mayAlias()
1395 // will return true if this is the case.
1396 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1397
1398 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1399
1400 unsigned SubRegIdx0, SubRegIdx1;
1401 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1402
1403 // Copy to the old destination registers.
1404 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1405 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1406 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1407
1408 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1409 .add(*Dest0) // Copy to same destination including flags and sub reg.
1410 .addReg(DestReg, 0, SubRegIdx0);
1411 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1412 .add(*Dest1)
1413 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1414
1415 CI.I->eraseFromParent();
1416 Paired.I->eraseFromParent();
1417 return New;
1418}
1419
1420MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1421 CombineInfo &CI, CombineInfo &Paired,
1422 MachineBasicBlock::iterator InsertBefore) {
1423 MachineBasicBlock *MBB = CI.I->getParent();
1424 DebugLoc DL = CI.I->getDebugLoc();
1425 const unsigned Opcode = getNewOpcode(CI, Paired);
1426
1427 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1428
1429 Register DestReg = MRI->createVirtualRegister(SuperRC);
1430 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1431
1432 // It shouldn't be possible to get this far if the two instructions
1433 // don't have a single memoperand, because MachineInstr::mayAlias()
1434 // will return true if this is the case.
1435 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1436
1437 MachineInstrBuilder New =
1438 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1439 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1440 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1441 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1442 New.addImm(MergedOffset);
1443 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1444
1445 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1446 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1447 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1448
1449 // Copy to the old destination registers.
1450 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1451 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1452 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1453
1454 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1455 .add(*Dest0) // Copy to same destination including flags and sub reg.
1456 .addReg(DestReg, 0, SubRegIdx0);
1457 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1458 .add(*Dest1)
1459 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1460
1461 CI.I->eraseFromParent();
1462 Paired.I->eraseFromParent();
1463 return New;
1464}
1465
1466MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1467 CombineInfo &CI, CombineInfo &Paired,
1468 MachineBasicBlock::iterator InsertBefore) {
1469 MachineBasicBlock *MBB = CI.I->getParent();
1470 DebugLoc DL = CI.I->getDebugLoc();
1471
1472 const unsigned Opcode = getNewOpcode(CI, Paired);
1473
1474 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1475
1476 // Copy to the new source register.
1477 Register DestReg = MRI->createVirtualRegister(SuperRC);
1478 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1479
1480 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1481
1482 AddressRegs Regs = getRegs(Opcode, *TII);
1483
1484 if (Regs.VAddr)
1485 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1486
1487 // It shouldn't be possible to get this far if the two instructions
1488 // don't have a single memoperand, because MachineInstr::mayAlias()
1489 // will return true if this is the case.
1490 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1491
1492 MachineInstr *New =
1493 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1494 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1495 .addImm(MergedOffset) // offset
1496 .addImm(CI.CPol) // cpol
1497 .addImm(0) // swz
1498 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1499
1500 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1501 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1502 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1503
1504 // Copy to the old destination registers.
1505 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1506 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1507 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1508
1509 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1510 .add(*Dest0) // Copy to same destination including flags and sub reg.
1511 .addReg(DestReg, 0, SubRegIdx0);
1512 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1513 .add(*Dest1)
1514 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1515
1516 CI.I->eraseFromParent();
1517 Paired.I->eraseFromParent();
1518 return New;
1519}
1520
1521MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1522 CombineInfo &CI, CombineInfo &Paired,
1523 MachineBasicBlock::iterator InsertBefore) {
1524 MachineBasicBlock *MBB = CI.I->getParent();
1525 DebugLoc DL = CI.I->getDebugLoc();
1526
1527 const unsigned Opcode = getNewOpcode(CI, Paired);
1528
1529 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1530
1531 // Copy to the new source register.
1532 Register DestReg = MRI->createVirtualRegister(SuperRC);
1533 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1534
1535 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1536
1537 AddressRegs Regs = getRegs(Opcode, *TII);
1538
1539 if (Regs.VAddr)
1540 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1541
1542 unsigned JoinedFormat =
1543 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1544
1545 // It shouldn't be possible to get this far if the two instructions
1546 // don't have a single memoperand, because MachineInstr::mayAlias()
1547 // will return true if this is the case.
1548 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1549
1550 MachineInstr *New =
1551 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1552 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1553 .addImm(MergedOffset) // offset
1554 .addImm(JoinedFormat) // format
1555 .addImm(CI.CPol) // cpol
1556 .addImm(0) // swz
1557 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1558
1559 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1560 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1561 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1562
1563 // Copy to the old destination registers.
1564 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1565 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1566 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1567
1568 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1569 .add(*Dest0) // Copy to same destination including flags and sub reg.
1570 .addReg(DestReg, 0, SubRegIdx0);
1571 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1572 .add(*Dest1)
1573 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1574
1575 CI.I->eraseFromParent();
1576 Paired.I->eraseFromParent();
1577 return New;
1578}
1579
1580MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1581 CombineInfo &CI, CombineInfo &Paired,
1582 MachineBasicBlock::iterator InsertBefore) {
1583 MachineBasicBlock *MBB = CI.I->getParent();
1584 DebugLoc DL = CI.I->getDebugLoc();
1585
1586 const unsigned Opcode = getNewOpcode(CI, Paired);
1587
1588 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1589 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1590 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1591
1592 // Copy to the new source register.
1593 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1594 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1595
1596 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1597 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1598
1599 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1600 .add(*Src0)
1601 .addImm(SubRegIdx0)
1602 .add(*Src1)
1603 .addImm(SubRegIdx1);
1604
1605 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1606 .addReg(SrcReg, RegState::Kill);
1607
1608 AddressRegs Regs = getRegs(Opcode, *TII);
1609
1610 if (Regs.VAddr)
1611 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1612
1613 unsigned JoinedFormat =
1614 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1615
1616 // It shouldn't be possible to get this far if the two instructions
1617 // don't have a single memoperand, because MachineInstr::mayAlias()
1618 // will return true if this is the case.
1619 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1620
1621 MachineInstr *New =
1622 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1623 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1624 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1625 .addImm(JoinedFormat) // format
1626 .addImm(CI.CPol) // cpol
1627 .addImm(0) // swz
1628 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1629
1630 CI.I->eraseFromParent();
1631 Paired.I->eraseFromParent();
1632 return New;
1633}
1634
1635MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1636 CombineInfo &CI, CombineInfo &Paired,
1637 MachineBasicBlock::iterator InsertBefore) {
1638 MachineBasicBlock *MBB = CI.I->getParent();
1639 DebugLoc DL = CI.I->getDebugLoc();
1640
1641 const unsigned Opcode = getNewOpcode(CI, Paired);
1642
1643 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1644 Register DestReg = MRI->createVirtualRegister(SuperRC);
1645
1646 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1647
1648 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1649 MIB.add(*SAddr);
1650
1651 MachineInstr *New =
1652 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1653 .addImm(std::min(CI.Offset, Paired.Offset))
1654 .addImm(CI.CPol)
1655 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1656
1657 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1658 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1659 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1660
1661 // Copy to the old destination registers.
1662 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1663 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1664 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1665
1666 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1667 .add(*Dest0) // Copy to same destination including flags and sub reg.
1668 .addReg(DestReg, 0, SubRegIdx0);
1669 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1670 .add(*Dest1)
1671 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1672
1673 CI.I->eraseFromParent();
1674 Paired.I->eraseFromParent();
1675 return New;
1676}
1677
1678MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1679 CombineInfo &CI, CombineInfo &Paired,
1680 MachineBasicBlock::iterator InsertBefore) {
1681 MachineBasicBlock *MBB = CI.I->getParent();
1682 DebugLoc DL = CI.I->getDebugLoc();
1683
1684 const unsigned Opcode = getNewOpcode(CI, Paired);
1685
1686 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1687 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1688 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1689
1690 // Copy to the new source register.
1691 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1692 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1693
1694 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1695 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1696
1697 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1698 .add(*Src0)
1699 .addImm(SubRegIdx0)
1700 .add(*Src1)
1701 .addImm(SubRegIdx1);
1702
1703 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1704 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1705 .addReg(SrcReg, RegState::Kill);
1706
1707 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1708 MIB.add(*SAddr);
1709
1710 MachineInstr *New =
1711 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1712 .addImm(CI.CPol)
1713 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1714
1715 CI.I->eraseFromParent();
1716 Paired.I->eraseFromParent();
1717 return New;
1718}
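// Illustrative sketch (editorial, not part of the upstream file): for two
// adjacent FLAT/GLOBAL dword stores at offsets 8 and 12 writing %x and %y,
// the routine above builds a REG_SEQUENCE that places %x in sub0 and %y in
// sub1 of a 64-bit register, then emits a single DWORDX2 store at offset 8,
// reusing the first instruction's vaddr (and saddr for the SADDR forms).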
1719
1720unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1721 const CombineInfo &Paired) {
1722 const unsigned Width = CI.Width + Paired.Width;
1723
1724 switch (getCommonInstClass(CI, Paired)) {
1725 default:
1726 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1727 // FIXME: Handle d16 correctly
1728 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1729 Width);
1730 case TBUFFER_LOAD:
1731 case TBUFFER_STORE:
1732 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1733 Width);
1734
1735 case UNKNOWN:
1736 llvm_unreachable("Unknown instruction class");
1737 case S_BUFFER_LOAD_IMM:
1738 switch (Width) {
1739 default:
1740 return 0;
1741 case 2:
1742 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1743 case 3:
1744 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1745 case 4:
1746 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1747 case 8:
1748 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1749 }
1750 case S_BUFFER_LOAD_SGPR_IMM:
1751 switch (Width) {
1752 default:
1753 return 0;
1754 case 2:
1755 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1756 case 3:
1757 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1758 case 4:
1759 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1760 case 8:
1761 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1762 }
1763 case S_LOAD_IMM:
1764 switch (Width) {
1765 default:
1766 return 0;
1767 case 2:
1768 return AMDGPU::S_LOAD_DWORDX2_IMM;
1769 case 3:
1770 return AMDGPU::S_LOAD_DWORDX3_IMM;
1771 case 4:
1772 return AMDGPU::S_LOAD_DWORDX4_IMM;
1773 case 8:
1774 return AMDGPU::S_LOAD_DWORDX8_IMM;
1775 }
1776 case GLOBAL_LOAD:
1777 switch (Width) {
1778 default:
1779 return 0;
1780 case 2:
1781 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1782 case 3:
1783 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1784 case 4:
1785 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1786 }
1787 case GLOBAL_LOAD_SADDR:
1788 switch (Width) {
1789 default:
1790 return 0;
1791 case 2:
1792 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1793 case 3:
1794 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1795 case 4:
1796 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1797 }
1798 case GLOBAL_STORE:
1799 switch (Width) {
1800 default:
1801 return 0;
1802 case 2:
1803 return AMDGPU::GLOBAL_STORE_DWORDX2;
1804 case 3:
1805 return AMDGPU::GLOBAL_STORE_DWORDX3;
1806 case 4:
1807 return AMDGPU::GLOBAL_STORE_DWORDX4;
1808 }
1809 case GLOBAL_STORE_SADDR:
1810 switch (Width) {
1811 default:
1812 return 0;
1813 case 2:
1814 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1815 case 3:
1816 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1817 case 4:
1818 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1819 }
1820 case FLAT_LOAD:
1821 switch (Width) {
1822 default:
1823 return 0;
1824 case 2:
1825 return AMDGPU::FLAT_LOAD_DWORDX2;
1826 case 3:
1827 return AMDGPU::FLAT_LOAD_DWORDX3;
1828 case 4:
1829 return AMDGPU::FLAT_LOAD_DWORDX4;
1830 }
1831 case FLAT_STORE:
1832 switch (Width) {
1833 default:
1834 return 0;
1835 case 2:
1836 return AMDGPU::FLAT_STORE_DWORDX2;
1837 case 3:
1838 return AMDGPU::FLAT_STORE_DWORDX3;
1839 case 4:
1840 return AMDGPU::FLAT_STORE_DWORDX4;
1841 }
1842 case MIMG:
1843 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1844 "No overlaps");
1845 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1846 }
1847}
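// Illustrative sketch (editorial, not part of the upstream file): merging two
// S_LOAD_DWORDX2_IMM loads gives a combined Width of 4, so the switch above
// selects S_LOAD_DWORDX4_IMM; an unsupported combined width such as 5 falls
// into the default case and returns 0, signalling that no merged opcode
// exists for that width.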
1848
1849std::pair<unsigned, unsigned>
1850SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1851 const CombineInfo &Paired) {
1852 assert((CI.InstClass != MIMG ||
1853 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1854 CI.Width + Paired.Width)) &&
1855 "No overlaps");
1856
1857 unsigned Idx0;
1858 unsigned Idx1;
1859
1860 static const unsigned Idxs[5][4] = {
1861 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1862 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1863 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1864 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1865 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1866 };
1867
1868 assert(CI.Width >= 1 && CI.Width <= 4);
1869 assert(Paired.Width >= 1 && Paired.Width <= 4);
1870
1871 if (Paired < CI) {
1872 Idx1 = Idxs[0][Paired.Width - 1];
1873 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1874 } else {
1875 Idx0 = Idxs[0][CI.Width - 1];
1876 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1877 }
1878
1879 return std::pair(Idx0, Idx1);
1880}
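// Illustrative sketch (editorial, not part of the upstream file): if CI is
// the lower-offset access with Width 1 and Paired has Width 2, the table
// lookup above yields Idx0 = sub0 and Idx1 = sub1_sub2, i.e. CI's value
// occupies the low dword of the merged register and Paired's value the next
// two; when Paired compares lower, the roles are swapped.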
1881
1882const TargetRegisterClass *
1883SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1884 const CombineInfo &Paired) {
1885 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1886 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1887 switch (CI.Width + Paired.Width) {
1888 default:
1889 return nullptr;
1890 case 2:
1891 return &AMDGPU::SReg_64_XEXECRegClass;
1892 case 3:
1893 return &AMDGPU::SGPR_96RegClass;
1894 case 4:
1895 return &AMDGPU::SGPR_128RegClass;
1896 case 8:
1897 return &AMDGPU::SGPR_256RegClass;
1898 case 16:
1899 return &AMDGPU::SGPR_512RegClass;
1900 }
1901 }
1902
1903 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1904 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1905 ? TRI->getAGPRClassForBitWidth(BitWidth)
1906 : TRI->getVGPRClassForBitWidth(BitWidth);
1907}
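// Illustrative sketch (editorial, not part of the upstream file): a merged
// scalar load with a combined width of 4 is assigned SGPR_128, while a pair
// of 32-bit vector accesses (combined width 2) gets a 64-bit VGPR class, or
// the matching AGPR class when the original data operands were AGPRs.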
1908
1909MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1910 CombineInfo &CI, CombineInfo &Paired,
1911 MachineBasicBlock::iterator InsertBefore) {
1912 MachineBasicBlock *MBB = CI.I->getParent();
1913 DebugLoc DL = CI.I->getDebugLoc();
1914
1915 const unsigned Opcode = getNewOpcode(CI, Paired);
1916
1917 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1918 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1919 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1920
1921 // Copy to the new source register.
1922 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1923 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1924
1925 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1926 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1927
1928 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1929 .add(*Src0)
1930 .addImm(SubRegIdx0)
1931 .add(*Src1)
1932 .addImm(SubRegIdx1);
1933
1934 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1935 .addReg(SrcReg, RegState::Kill);
1936
1937 AddressRegs Regs = getRegs(Opcode, *TII);
1938
1939 if (Regs.VAddr)
1940 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1941
1942
1943 // It shouldn't be possible to get this far if the two instructions
1944 // don't have a single memoperand, because MachineInstr::mayAlias()
1945 // will return true if this is the case.
1946 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1947
1948 MachineInstr *New =
1949 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1950 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1951 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1952 .addImm(CI.CPol) // cpol
1953 .addImm(0) // swz
1954 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1955
1956 CI.I->eraseFromParent();
1957 Paired.I->eraseFromParent();
1958 return New;
1959}
1960
1961MachineOperand
1962SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1963 APInt V(32, Val, true);
1964 if (TII->isInlineConstant(V))
1965 return MachineOperand::CreateImm(Val);
1966
1967 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1968 MachineInstr *Mov =
1969 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1970 TII->get(AMDGPU::S_MOV_B32), Reg)
1971 .addImm(Val);
1972 (void)Mov;
1973 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1974 return MachineOperand::CreateReg(Reg, false);
1975}
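// Illustrative sketch (editorial, not part of the upstream file): a value
// such as 4 or -16 is an inline constant, so the helper above returns a plain
// immediate operand; a value like 0x1800 is not, so it is materialized with
// S_MOV_B32 into a fresh SGPR and returned as a register operand.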
1976
1977// Compute base address using Addr and return the final register.
1978Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1979 const MemAddress &Addr) const {
1980 MachineBasicBlock *MBB = MI.getParent();
1981 MachineBasicBlock::iterator MBBI = MI.getIterator();
1982 DebugLoc DL = MI.getDebugLoc();
1983
1984 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1985 Addr.Base.LoSubReg) &&
1986 "Expected 32-bit Base-Register-Low!!");
1987
1988 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1989 Addr.Base.HiSubReg) &&
1990 "Expected 32-bit Base-Register-Hi!!");
1991
1992 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1993 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1994 MachineOperand OffsetHi =
1995 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1996
1997 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1998 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1999 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2000
2001 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2002 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2003 MachineInstr *LoHalf =
2004 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2005 .addReg(CarryReg, RegState::Define)
2006 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2007 .add(OffsetLo)
2008 .addImm(0); // clamp bit
2009 (void)LoHalf;
2010 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2011
2012 MachineInstr *HiHalf =
2013 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2014 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2015 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2016 .add(OffsetHi)
2017 .addReg(CarryReg, RegState::Kill)
2018 .addImm(0); // clamp bit
2019 (void)HiHalf;
2020 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2021
2022 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2023 MachineInstr *FullBase =
2024 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2025 .addReg(DestSub0)
2026 .addImm(AMDGPU::sub0)
2027 .addReg(DestSub1)
2028 .addImm(AMDGPU::sub1);
2029 (void)FullBase;
2030 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2031
2032 return FullDestReg;
2033}
2034
2035// Update base and offset with the NewBase and NewOffset in MI.
2036void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2037 Register NewBase,
2038 int32_t NewOffset) const {
2039 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2040 Base->setReg(NewBase);
2041 Base->setIsKill(false);
2042 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2043}
2044
2045std::optional<int32_t>
2046SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2047 if (Op.isImm())
2048 return Op.getImm();
2049
2050 if (!Op.isReg())
2051 return std::nullopt;
2052
2053 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2054 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2055 !Def->getOperand(1).isImm())
2056 return std::nullopt;
2057
2058 return Def->getOperand(1).getImm();
2059}
2060
2061// Analyze Base and extract:
2062// - 32bit base registers, subregisters
2063// - 64bit constant offset
2064// Expecting base computation as:
2065// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2066// %LO:vgpr_32, %c:sreg_64_xexec =
2067// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2068// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2069// %Base:vreg_64 =
2070// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2071void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2072 MemAddress &Addr) const {
2073 if (!Base.isReg())
2074 return;
2075
2076 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2077 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2078 || Def->getNumOperands() != 5)
2079 return;
2080
2081 MachineOperand BaseLo = Def->getOperand(1);
2082 MachineOperand BaseHi = Def->getOperand(3);
2083 if (!BaseLo.isReg() || !BaseHi.isReg())
2084 return;
2085
2086 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2087 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2088
2089 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2090 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2091 return;
2092
2093 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2094 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2095
2096 auto Offset0P = extractConstOffset(*Src0);
2097 if (Offset0P)
2098 BaseLo = *Src1;
2099 else {
2100 if (!(Offset0P = extractConstOffset(*Src1)))
2101 return;
2102 BaseLo = *Src0;
2103 }
2104
2105 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2106 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2107
2108 if (Src0->isImm())
2109 std::swap(Src0, Src1);
2110
2111 if (!Src1->isImm())
2112 return;
2113
2114 uint64_t Offset1 = Src1->getImm();
2115 BaseHi = *Src0;
2116
2117 Addr.Base.LoReg = BaseLo.getReg();
2118 Addr.Base.HiReg = BaseHi.getReg();
2119 Addr.Base.LoSubReg = BaseLo.getSubReg();
2120 Addr.Base.HiSubReg = BaseHi.getSubReg();
2121 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2122}
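// Illustrative sketch (editorial, not part of the upstream file): for the
// pattern documented above with a low-half add constant of 0x1000 and a
// high-half add constant of 0, the function records Addr.Offset = 0x1000; a
// non-zero high-half constant K instead contributes K << 32, matching the
// final (Lo & 0xffffffff) | (Hi << 32) combination in the code above.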
2123
2124bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2125    MachineInstr &MI,
2126    MemInfoMap &Visited,
2127 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2128
2129 if (!(MI.mayLoad() ^ MI.mayStore()))
2130 return false;
2131
2132 // TODO: Support flat and scratch.
2133 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2134 return false;
2135
2136 if (MI.mayLoad() &&
2137 TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2138 return false;
2139
2140 if (AnchorList.count(&MI))
2141 return false;
2142
2143 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2144
2145 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2146 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2147 return false;
2148 }
2149
2150 // Step1: Find the base-registers and a 64bit constant offset.
2151 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2152 MemAddress MAddr;
2153 if (!Visited.contains(&MI)) {
2154 processBaseWithConstOffset(Base, MAddr);
2155 Visited[&MI] = MAddr;
2156 } else
2157 MAddr = Visited[&MI];
2158
2159 if (MAddr.Offset == 0) {
2160 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2161 " constant offsets that can be promoted.\n";);
2162 return false;
2163 }
2164
2165 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2166 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2167
2168  // Step2: Traverse through MI's basic block and find an anchor (that has the
2169  // same base registers) with the highest 13bit distance from MI's offset.
2170 // E.g. (64bit loads)
2171 // bb:
2172 // addr1 = &a + 4096; load1 = load(addr1, 0)
2173 // addr2 = &a + 6144; load2 = load(addr2, 0)
2174 // addr3 = &a + 8192; load3 = load(addr3, 0)
2175 // addr4 = &a + 10240; load4 = load(addr4, 0)
2176 // addr5 = &a + 12288; load5 = load(addr5, 0)
2177 //
2178 // Starting from the first load, the optimization will try to find a new base
2179 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2180  // have 13bit distance from &a + 4096. The heuristic considers &a + 8192
2181  // as the new base (anchor) because the maximum distance can presumably
2182  // accommodate more intermediate bases.
2183 //
2184 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2185 // (&a + 8192) for load1, load2, load4.
2186 // addr = &a + 8192
2187 // load1 = load(addr, -4096)
2188 // load2 = load(addr, -2048)
2189 // load3 = load(addr, 0)
2190 // load4 = load(addr, 2048)
2191 // addr5 = &a + 12288; load5 = load(addr5, 0)
2192 //
2193 MachineInstr *AnchorInst = nullptr;
2194 MemAddress AnchorAddr;
2195 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2196  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2197
2198 MachineBasicBlock *MBB = MI.getParent();
2199  MachineBasicBlock::iterator E = MBB->end();
2200  MachineBasicBlock::iterator MBBI = MI.getIterator();
2201 ++MBBI;
2202 const SITargetLowering *TLI =
2203 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2204
2205 for ( ; MBBI != E; ++MBBI) {
2206 MachineInstr &MINext = *MBBI;
2207 // TODO: Support finding an anchor(with same base) from store addresses or
2208 // any other load addresses where the opcodes are different.
2209 if (MINext.getOpcode() != MI.getOpcode() ||
2210 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2211 continue;
2212
2213 const MachineOperand &BaseNext =
2214 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2215 MemAddress MAddrNext;
2216 if (!Visited.contains(&MINext)) {
2217 processBaseWithConstOffset(BaseNext, MAddrNext);
2218 Visited[&MINext] = MAddrNext;
2219 } else
2220 MAddrNext = Visited[&MINext];
2221
2222 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2223 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2224 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2225 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2226 continue;
2227
2228 InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2229
2230 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2231    TargetLoweringBase::AddrMode AM;
2232    AM.HasBaseReg = true;
2233 AM.BaseOffs = Dist;
2234 if (TLI->isLegalGlobalAddressingMode(AM) &&
2235 (uint32_t)std::abs(Dist) > MaxDist) {
2236 MaxDist = std::abs(Dist);
2237
2238 AnchorAddr = MAddrNext;
2239 AnchorInst = &MINext;
2240 }
2241 }
2242
2243 if (AnchorInst) {
2244 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2245 AnchorInst->dump());
2246 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2247 << AnchorAddr.Offset << "\n\n");
2248
2249 // Instead of moving up, just re-compute anchor-instruction's base address.
2250 Register Base = computeBase(MI, AnchorAddr);
2251
2252 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2253 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2254
2255 for (auto P : InstsWCommonBase) {
2256      TargetLoweringBase::AddrMode AM;
2257      AM.HasBaseReg = true;
2258 AM.BaseOffs = P.second - AnchorAddr.Offset;
2259
2260 if (TLI->isLegalGlobalAddressingMode(AM)) {
2261 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
2262 dbgs() << ")"; P.first->dump());
2263 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2264 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2265 }
2266 }
2267 AnchorList.insert(AnchorInst);
2268 return true;
2269 }
2270
2271 return false;
2272}
2273
2274void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2275 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2276 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2277 if (AddrList.front().InstClass == CI.InstClass &&
2278 AddrList.front().IsAGPR == CI.IsAGPR &&
2279 AddrList.front().hasSameBaseAddress(CI)) {
2280 AddrList.emplace_back(CI);
2281 return;
2282 }
2283 }
2284
2285 // Base address not found, so add a new list.
2286 MergeableInsts.emplace_back(1, CI);
2287}
2288
2289std::pair<MachineBasicBlock::iterator, bool>
2290SILoadStoreOptimizer::collectMergeableInsts(
2291    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2292    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2293 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2294 bool Modified = false;
2295
2296  // Sort potentially mergeable instructions into lists, one list per base address.
2297 unsigned Order = 0;
2298 MachineBasicBlock::iterator BlockI = Begin;
2299 for (; BlockI != End; ++BlockI) {
2300 MachineInstr &MI = *BlockI;
2301
2302 // We run this before checking if an address is mergeable, because it can produce
2303 // better code even if the instructions aren't mergeable.
2304 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2305 Modified = true;
2306
2307 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2308    // barriers. We can still look for separate merges after such a barrier.
2309 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2310 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2311
2312 // Search will resume after this instruction in a separate merge list.
2313 ++BlockI;
2314 break;
2315 }
2316
2317 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2318 if (InstClass == UNKNOWN)
2319 continue;
2320
2321 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2322 int Swizzled =
2323 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2324 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2325 continue;
2326
2327 CombineInfo CI;
2328 CI.setMI(MI, *this);
2329 CI.Order = Order++;
2330
2331 if (!CI.hasMergeableAddress(*MRI))
2332 continue;
2333
2334 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2335 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2336 // operands. However we are reporting that ds_write2 shall have
2337 // only VGPR data so that machine copy propagation does not
2338      //        create an illegal instruction with VGPR and AGPR sources.
2339      //        Consequently, if we create such an instruction, the verifier
2340 // will complain.
2341 continue;
2342 }
2343
2344 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2345
2346 addInstToMergeableList(CI, MergeableInsts);
2347 }
2348
2349 // At this point we have lists of Mergeable instructions.
2350 //
2351 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2352 // list try to find an instruction that can be merged with I. If an instruction
2353 // is found, it is stored in the Paired field. If no instructions are found, then
2354 // the CombineInfo object is deleted from the list.
2355
2356 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2357 E = MergeableInsts.end(); I != E;) {
2358
2359 std::list<CombineInfo> &MergeList = *I;
2360 if (MergeList.size() <= 1) {
2361 // This means we have found only one instruction with a given address
2362 // that can be merged, and we need at least 2 instructions to do a merge,
2363 // so this list can be discarded.
2364 I = MergeableInsts.erase(I);
2365 continue;
2366 }
2367
2368 // Sort the lists by offsets, this way mergeable instructions will be
2369 // adjacent to each other in the list, which will make it easier to find
2370 // matches.
2371 MergeList.sort(
2372 [] (const CombineInfo &A, const CombineInfo &B) {
2373 return A.Offset < B.Offset;
2374 });
2375 ++I;
2376 }
2377
2378 return std::pair(BlockI, Modified);
2379}
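// Illustrative sketch (editorial, not part of the upstream file): a block of
//   load A, load B, an instruction with ordered memory refs or unmodeled
//   side effects, load C
// is split here into two sections: A and B land in one merge list, the scan
// stops after the barrier, and C is collected on the next call using the
// returned iterator as the new starting point.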
2380
2381// Scan through looking for adjacent LDS operations with constant offsets from
2382// the same base register. We rely on the scheduler to do the hard work of
2383// clustering nearby loads, and assume these are all adjacent.
2384bool SILoadStoreOptimizer::optimizeBlock(
2385 std::list<std::list<CombineInfo> > &MergeableInsts) {
2386 bool Modified = false;
2387
2388 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2389 E = MergeableInsts.end(); I != E;) {
2390 std::list<CombineInfo> &MergeList = *I;
2391
2392 bool OptimizeListAgain = false;
2393 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2394 // We weren't able to make any changes, so delete the list so we don't
2395 // process the same instructions the next time we try to optimize this
2396 // block.
2397 I = MergeableInsts.erase(I);
2398 continue;
2399 }
2400
2401 Modified = true;
2402
2403 // We made changes, but also determined that there were no more optimization
2404 // opportunities, so we don't need to reprocess the list
2405 if (!OptimizeListAgain) {
2406 I = MergeableInsts.erase(I);
2407 continue;
2408 }
2409 OptimizeAgain = true;
2410 }
2411 return Modified;
2412}
2413
2414bool
2415SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2416 std::list<CombineInfo> &MergeList,
2417 bool &OptimizeListAgain) {
2418 if (MergeList.empty())
2419 return false;
2420
2421 bool Modified = false;
2422
2423 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2424 Next = std::next(I)) {
2425
2426 auto First = I;
2427 auto Second = Next;
2428
2429 if ((*First).Order > (*Second).Order)
2430 std::swap(First, Second);
2431 CombineInfo &CI = *First;
2432 CombineInfo &Paired = *Second;
2433
2434 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2435 if (!Where) {
2436 ++I;
2437 continue;
2438 }
2439
2440 Modified = true;
2441
2442 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2443
2444    MachineBasicBlock::iterator NewMI;
2445    switch (CI.InstClass) {
2446 default:
2447 llvm_unreachable("unknown InstClass");
2448 break;
2449 case DS_READ:
2450 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2451 break;
2452 case DS_WRITE:
2453 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2454 break;
2455 case S_BUFFER_LOAD_IMM:
2456 case S_BUFFER_LOAD_SGPR_IMM:
2457 case S_LOAD_IMM:
2458 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2459 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2460 break;
2461 case BUFFER_LOAD:
2462 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2463 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2464 break;
2465 case BUFFER_STORE:
2466 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2467 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2468 break;
2469 case MIMG:
2470 NewMI = mergeImagePair(CI, Paired, Where->I);
2471 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2472 break;
2473 case TBUFFER_LOAD:
2474 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2475 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2476 break;
2477 case TBUFFER_STORE:
2478 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2479 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2480 break;
2481 case FLAT_LOAD:
2482 case GLOBAL_LOAD:
2483 case GLOBAL_LOAD_SADDR:
2484 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2485 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2486 break;
2487 case FLAT_STORE:
2488 case GLOBAL_STORE:
2489 case GLOBAL_STORE_SADDR:
2490 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2491 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2492 break;
2493 }
2494 CI.setMI(NewMI, *this);
2495 CI.Order = Where->Order;
2496 if (I == Second)
2497 I = Next;
2498
2499 MergeList.erase(Second);
2500 }
2501
2502 return Modified;
2503}
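// Illustrative sketch (editorial, not part of the upstream file): because the
// merged result is written back into the list via CI.setMI() and
// OptimizeListAgain is set while the combined width is still below the limit,
// two dword buffer loads can first become a DWORDX2 and then, on a later
// optimizeBlock() iteration, be merged again with a neighbouring DWORDX2 into
// a DWORDX4.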
2504
2505bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2506 if (skipFunction(MF.getFunction()))
2507 return false;
2508
2509 STM = &MF.getSubtarget<GCNSubtarget>();
2510 if (!STM->loadStoreOptEnabled())
2511 return false;
2512
2513 TII = STM->getInstrInfo();
2514 TRI = &TII->getRegisterInfo();
2515
2516 MRI = &MF.getRegInfo();
2517 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2518
2519 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2520
2521 bool Modified = false;
2522
2523 // Contains the list of instructions for which constant offsets are being
2524  // promoted to the IMM. This is tracked for an entire block at a time.
2525  SmallPtrSet<MachineInstr *, 4> AnchorList;
2526  MemInfoMap Visited;
2527
2528 for (MachineBasicBlock &MBB : MF) {
2529 MachineBasicBlock::iterator SectionEnd;
2530 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2531 I = SectionEnd) {
2532 bool CollectModified;
2533 std::list<std::list<CombineInfo>> MergeableInsts;
2534
2535 // First pass: Collect list of all instructions we know how to merge in a
2536 // subset of the block.
2537 std::tie(SectionEnd, CollectModified) =
2538 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2539
2540 Modified |= CollectModified;
2541
2542 do {
2543 OptimizeAgain = false;
2544 Modified |= optimizeBlock(MergeableInsts);
2545 } while (OptimizeAgain);
2546 }
2547
2548 Visited.clear();
2549 AnchorList.clear();
2550 }
2551
2552 return Modified;
2553}