SILoadStoreOptimizer.cpp
1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with close-by immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset into the immediate field by
23// adjusting the base. It looks for a base from nearby instructions that leaves a
24// 13-bit constant offset, and then promotes that 13-bit offset into the
25// immediate field.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset fields, but are close enough together that their differences fit in
56// 8 bits, we can add to the base pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
62#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63#include "llvm/Analysis/AliasAnalysis.h"
64#include "llvm/CodeGen/MachineFunctionPass.h"
65#include "llvm/InitializePasses.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 S_BUFFER_LOAD_SGPR_IMM,
78 S_LOAD_IMM,
79 BUFFER_LOAD,
80 BUFFER_STORE,
81 MIMG,
82 TBUFFER_LOAD,
83 TBUFFER_STORE,
84 GLOBAL_LOAD_SADDR,
85 GLOBAL_STORE_SADDR,
86 FLAT_LOAD,
87 FLAT_STORE,
88 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89 GLOBAL_STORE // any CombineInfo, they are only ever returned by
90 // getCommonInstClass.
91};
92
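// Records which address operands (and how many vector address registers) an
// instruction of a given opcode carries; filled in by getRegs() below and
// consumed when CombineInfo collects its address operands.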
93struct AddressRegs {
94 unsigned char NumVAddrs = 0;
95 bool SBase = false;
96 bool SRsrc = false;
97 bool SOffset = false;
98 bool SAddr = false;
99 bool VAddr = false;
100 bool Addr = false;
101 bool SSamp = false;
102};
103
104// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105const unsigned MaxAddressRegs = 12 + 1 + 1;
106
107class SILoadStoreOptimizer : public MachineFunctionPass {
108 struct CombineInfo {
109 MachineBasicBlock::iterator I;
110 unsigned EltSize;
111 unsigned Offset;
112 unsigned Width;
113 unsigned Format;
114 unsigned BaseOff;
115 unsigned DMask;
116 InstClassEnum InstClass;
117 unsigned CPol = 0;
118 bool IsAGPR;
119 bool UseST64;
120 int AddrIdx[MaxAddressRegs];
121 const MachineOperand *AddrReg[MaxAddressRegs];
122 unsigned NumAddresses;
123 unsigned Order;
124
125 bool hasSameBaseAddress(const CombineInfo &CI) {
126 if (NumAddresses != CI.NumAddresses)
127 return false;
128
129 const MachineInstr &MI = *CI.I;
130 for (unsigned i = 0; i < NumAddresses; i++) {
131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132
133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136 return false;
137 }
138 continue;
139 }
140
141 // Check same base pointer. Be careful of subregisters, which can occur
142 // with vectors of pointers.
143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145 return false;
146 }
147 }
148 return true;
149 }
150
151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152 for (unsigned i = 0; i < NumAddresses; ++i) {
153 const MachineOperand *AddrOp = AddrReg[i];
154 // Immediates are always OK.
155 if (AddrOp->isImm())
156 continue;
157
158 // Don't try to merge addresses that aren't either immediates or registers.
159 // TODO: Should be possible to merge FrameIndexes and maybe some other
160 // non-register operands.
161 if (!AddrOp->isReg())
162 return false;
163
164 // TODO: We should be able to merge instructions with other physical reg
165 // addresses too.
166 if (AddrOp->getReg().isPhysical() &&
167 AddrOp->getReg() != AMDGPU::SGPR_NULL)
168 return false;
169
170 // If an address has only one use then there will be no other
171 // instructions with the same address, so we can't merge this one.
172 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173 return false;
174 }
175 return true;
176 }
177
178 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179
180 // Compare by pointer order.
181 bool operator<(const CombineInfo& Other) const {
182 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183 }
184 };
185
186 struct BaseRegisters {
187 Register LoReg;
188 Register HiReg;
189
190 unsigned LoSubReg = 0;
191 unsigned HiSubReg = 0;
192 };
193
194 struct MemAddress {
195 BaseRegisters Base;
196 int64_t Offset = 0;
197 };
198
199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200
201private:
202 const GCNSubtarget *STM = nullptr;
203 const SIInstrInfo *TII = nullptr;
204 const SIRegisterInfo *TRI = nullptr;
205 MachineRegisterInfo *MRI = nullptr;
206 AliasAnalysis *AA = nullptr;
207 bool OptimizeAgain;
208
209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210 const DenseSet<Register> &ARegUses,
211 const MachineInstr &A, const MachineInstr &B) const;
212 static bool dmasksCanBeCombined(const CombineInfo &CI,
213 const SIInstrInfo &TII,
214 const CombineInfo &Paired);
215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216 CombineInfo &Paired, bool Modify = false);
217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218 const CombineInfo &Paired);
219 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221 const CombineInfo &Paired);
222 const TargetRegisterClass *
223 getTargetRegisterClass(const CombineInfo &CI,
224 const CombineInfo &Paired) const;
225 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
226
227 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
228
229 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
230 MachineBasicBlock::iterator InsertBefore, int OpName,
231 Register DestReg) const;
232 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
233 MachineBasicBlock::iterator InsertBefore,
234 int OpName) const;
235
236 unsigned read2Opcode(unsigned EltSize) const;
237 unsigned read2ST64Opcode(unsigned EltSize) const;
238 MachineBasicBlock::iterator
239 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
240 MachineBasicBlock::iterator InsertBefore);
241
242 unsigned write2Opcode(unsigned EltSize) const;
243 unsigned write2ST64Opcode(unsigned EltSize) const;
244 MachineBasicBlock::iterator
245 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
246 MachineBasicBlock::iterator InsertBefore);
247 MachineBasicBlock::iterator
248 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
249 MachineBasicBlock::iterator InsertBefore);
250 MachineBasicBlock::iterator
251 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
252 MachineBasicBlock::iterator InsertBefore);
253 MachineBasicBlock::iterator
254 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
255 MachineBasicBlock::iterator InsertBefore);
256 MachineBasicBlock::iterator
257 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
258 MachineBasicBlock::iterator InsertBefore);
259 MachineBasicBlock::iterator
260 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
261 MachineBasicBlock::iterator InsertBefore);
262 MachineBasicBlock::iterator
263 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
264 MachineBasicBlock::iterator InsertBefore);
265 MachineBasicBlock::iterator
266 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
267 MachineBasicBlock::iterator InsertBefore);
268 MachineBasicBlock::iterator
269 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
270 MachineBasicBlock::iterator InsertBefore);
271
272 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
273 int32_t NewOffset) const;
274 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
275 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
276 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
277 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
278 /// Promotes constant offset to the immediate by adjusting the base. It
279 /// tries to use a base from the nearby instructions that allows it to have
280 /// a 13bit constant offset which gets promoted to the immediate.
281 bool promoteConstantOffsetToImm(MachineInstr &CI,
282 MemInfoMap &Visited,
283 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
284 void addInstToMergeableList(const CombineInfo &CI,
285 std::list<std::list<CombineInfo> > &MergeableInsts) const;
286
287 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
288 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
289 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
290 std::list<std::list<CombineInfo>> &MergeableInsts) const;
291
292 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
293 const CombineInfo &Paired);
294
295 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
296 const CombineInfo &Paired);
297
298public:
299 static char ID;
300
301 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
302 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
303 }
304
305 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
306 bool &OptimizeListAgain);
307 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
308
309 bool runOnMachineFunction(MachineFunction &MF) override;
310
311 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
312
313 void getAnalysisUsage(AnalysisUsage &AU) const override {
314 AU.setPreservesCFG();
315 AU.addRequired<AAResultsWrapperPass>();
316
317 MachineFunctionPass::getAnalysisUsage(AU);
318 }
319
320 MachineFunctionProperties getRequiredProperties() const override {
321 return MachineFunctionProperties()
322 .set(MachineFunctionProperties::Property::IsSSA);
323 }
324};
325
326static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
327 const unsigned Opc = MI.getOpcode();
328
329 if (TII.isMUBUF(Opc)) {
330 // FIXME: Handle d16 correctly
331 return AMDGPU::getMUBUFElements(Opc);
332 }
333 if (TII.isImage(MI)) {
334 uint64_t DMaskImm =
335 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
336 return llvm::popcount(DMaskImm);
337 }
338 if (TII.isMTBUF(Opc)) {
339 return AMDGPU::getMTBUFElements(Opc);
340 }
341
342 switch (Opc) {
343 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345 case AMDGPU::S_LOAD_DWORD_IMM:
346 case AMDGPU::GLOBAL_LOAD_DWORD:
347 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
348 case AMDGPU::GLOBAL_STORE_DWORD:
349 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
350 case AMDGPU::FLAT_LOAD_DWORD:
351 case AMDGPU::FLAT_STORE_DWORD:
352 return 1;
353 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
356 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
357 case AMDGPU::S_LOAD_DWORDX2_IMM:
358 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
359 case AMDGPU::GLOBAL_LOAD_DWORDX2:
360 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
361 case AMDGPU::GLOBAL_STORE_DWORDX2:
362 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
363 case AMDGPU::FLAT_LOAD_DWORDX2:
364 case AMDGPU::FLAT_STORE_DWORDX2:
365 return 2;
366 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
367 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
368 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
369 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
370 case AMDGPU::S_LOAD_DWORDX3_IMM:
371 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
372 case AMDGPU::GLOBAL_LOAD_DWORDX3:
373 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
374 case AMDGPU::GLOBAL_STORE_DWORDX3:
375 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
376 case AMDGPU::FLAT_LOAD_DWORDX3:
377 case AMDGPU::FLAT_STORE_DWORDX3:
378 return 3;
379 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
380 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
381 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
382 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
383 case AMDGPU::S_LOAD_DWORDX4_IMM:
384 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
385 case AMDGPU::GLOBAL_LOAD_DWORDX4:
386 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
387 case AMDGPU::GLOBAL_STORE_DWORDX4:
388 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
389 case AMDGPU::FLAT_LOAD_DWORDX4:
390 case AMDGPU::FLAT_STORE_DWORDX4:
391 return 4;
392 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
393 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
394 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
395 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
396 case AMDGPU::S_LOAD_DWORDX8_IMM:
397 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
398 return 8;
399 case AMDGPU::DS_READ_B32:
400 case AMDGPU::DS_READ_B32_gfx9:
401 case AMDGPU::DS_WRITE_B32:
402 case AMDGPU::DS_WRITE_B32_gfx9:
403 return 1;
404 case AMDGPU::DS_READ_B64:
405 case AMDGPU::DS_READ_B64_gfx9:
406 case AMDGPU::DS_WRITE_B64:
407 case AMDGPU::DS_WRITE_B64_gfx9:
408 return 2;
409 default:
410 return 0;
411 }
412}
413
414/// Maps instruction opcode to enum InstClassEnum.
415static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
416 switch (Opc) {
417 default:
418 if (TII.isMUBUF(Opc)) {
419 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
420 default:
421 return UNKNOWN;
422 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
423 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
424 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
425 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
426 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
427 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
428 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
429 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
430 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
431 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
432 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
433 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
434 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
435 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
436 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
437 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
438 return BUFFER_LOAD;
439 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
440 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
441 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
442 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
443 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
444 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
445 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
446 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
447 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
448 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
449 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
450 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
451 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
452 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
453 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
454 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
455 return BUFFER_STORE;
456 }
457 }
458 if (TII.isImage(Opc)) {
459 // Ignore instructions encoded without vaddr.
460 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
461 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
462 return UNKNOWN;
463 // Ignore BVH instructions
464 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
465 return UNKNOWN;
466 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
467 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
468 TII.isGather4(Opc))
469 return UNKNOWN;
470 return MIMG;
471 }
472 if (TII.isMTBUF(Opc)) {
473 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
474 default:
475 return UNKNOWN;
476 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
477 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
478 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
479 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
480 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
481 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
482 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
483 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
484 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
485 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
486 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
487 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
488 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
489 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
490 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
491 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
492 return TBUFFER_LOAD;
493 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
494 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
495 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
496 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
497 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
498 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
499 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
500 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
501 return TBUFFER_STORE;
502 }
503 }
504 return UNKNOWN;
505 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
506 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
507 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
508 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
509 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
510 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
511 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
512 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
513 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
514 return S_BUFFER_LOAD_IMM;
515 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
516 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
517 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
518 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
519 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
520 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
521 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
522 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
523 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
524 return S_BUFFER_LOAD_SGPR_IMM;
525 case AMDGPU::S_LOAD_DWORD_IMM:
526 case AMDGPU::S_LOAD_DWORDX2_IMM:
527 case AMDGPU::S_LOAD_DWORDX3_IMM:
528 case AMDGPU::S_LOAD_DWORDX4_IMM:
529 case AMDGPU::S_LOAD_DWORDX8_IMM:
530 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
531 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
532 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
533 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
534 return S_LOAD_IMM;
535 case AMDGPU::DS_READ_B32:
536 case AMDGPU::DS_READ_B32_gfx9:
537 case AMDGPU::DS_READ_B64:
538 case AMDGPU::DS_READ_B64_gfx9:
539 return DS_READ;
540 case AMDGPU::DS_WRITE_B32:
541 case AMDGPU::DS_WRITE_B32_gfx9:
542 case AMDGPU::DS_WRITE_B64:
543 case AMDGPU::DS_WRITE_B64_gfx9:
544 return DS_WRITE;
545 case AMDGPU::GLOBAL_LOAD_DWORD:
546 case AMDGPU::GLOBAL_LOAD_DWORDX2:
547 case AMDGPU::GLOBAL_LOAD_DWORDX3:
548 case AMDGPU::GLOBAL_LOAD_DWORDX4:
549 case AMDGPU::FLAT_LOAD_DWORD:
550 case AMDGPU::FLAT_LOAD_DWORDX2:
551 case AMDGPU::FLAT_LOAD_DWORDX3:
552 case AMDGPU::FLAT_LOAD_DWORDX4:
553 return FLAT_LOAD;
554 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
555 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
556 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
557 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
558 return GLOBAL_LOAD_SADDR;
559 case AMDGPU::GLOBAL_STORE_DWORD:
560 case AMDGPU::GLOBAL_STORE_DWORDX2:
561 case AMDGPU::GLOBAL_STORE_DWORDX3:
562 case AMDGPU::GLOBAL_STORE_DWORDX4:
563 case AMDGPU::FLAT_STORE_DWORD:
564 case AMDGPU::FLAT_STORE_DWORDX2:
565 case AMDGPU::FLAT_STORE_DWORDX3:
566 case AMDGPU::FLAT_STORE_DWORDX4:
567 return FLAT_STORE;
568 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
569 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
570 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
571 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
572 return GLOBAL_STORE_SADDR;
573 }
574}
575
576/// Determines instruction subclass from opcode. Only instructions
577/// of the same subclass can be merged together. The merged instruction may have
578/// a different subclass but must have the same class.
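/// For example, all S_LOAD_DWORD*_IMM widths map to the S_LOAD_DWORD_IMM
/// subclass, while MUBUF/MTBUF opcodes keep their base opcode, so buffer
/// accesses with different addressing modes (e.g. OFFEN vs. OFFSET) are never
/// paired with each other.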
579static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
580 switch (Opc) {
581 default:
582 if (TII.isMUBUF(Opc))
583 return AMDGPU::getMUBUFBaseOpcode(Opc);
584 if (TII.isImage(Opc)) {
585 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
586 assert(Info);
587 return Info->BaseOpcode;
588 }
589 if (TII.isMTBUF(Opc))
590 return AMDGPU::getMTBUFBaseOpcode(Opc);
591 return -1;
592 case AMDGPU::DS_READ_B32:
593 case AMDGPU::DS_READ_B32_gfx9:
594 case AMDGPU::DS_READ_B64:
595 case AMDGPU::DS_READ_B64_gfx9:
596 case AMDGPU::DS_WRITE_B32:
597 case AMDGPU::DS_WRITE_B32_gfx9:
598 case AMDGPU::DS_WRITE_B64:
599 case AMDGPU::DS_WRITE_B64_gfx9:
600 return Opc;
601 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
602 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
603 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
604 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
605 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
606 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
607 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
608 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
609 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
610 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
611 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
612 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
613 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
614 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
615 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
616 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
617 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
618 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
619 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
620 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
621 case AMDGPU::S_LOAD_DWORD_IMM:
622 case AMDGPU::S_LOAD_DWORDX2_IMM:
623 case AMDGPU::S_LOAD_DWORDX3_IMM:
624 case AMDGPU::S_LOAD_DWORDX4_IMM:
625 case AMDGPU::S_LOAD_DWORDX8_IMM:
626 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
627 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
628 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
629 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
630 return AMDGPU::S_LOAD_DWORD_IMM;
631 case AMDGPU::GLOBAL_LOAD_DWORD:
632 case AMDGPU::GLOBAL_LOAD_DWORDX2:
633 case AMDGPU::GLOBAL_LOAD_DWORDX3:
634 case AMDGPU::GLOBAL_LOAD_DWORDX4:
635 case AMDGPU::FLAT_LOAD_DWORD:
636 case AMDGPU::FLAT_LOAD_DWORDX2:
637 case AMDGPU::FLAT_LOAD_DWORDX3:
638 case AMDGPU::FLAT_LOAD_DWORDX4:
639 return AMDGPU::FLAT_LOAD_DWORD;
640 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
641 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
642 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
643 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
644 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
645 case AMDGPU::GLOBAL_STORE_DWORD:
646 case AMDGPU::GLOBAL_STORE_DWORDX2:
647 case AMDGPU::GLOBAL_STORE_DWORDX3:
648 case AMDGPU::GLOBAL_STORE_DWORDX4:
649 case AMDGPU::FLAT_STORE_DWORD:
650 case AMDGPU::FLAT_STORE_DWORDX2:
651 case AMDGPU::FLAT_STORE_DWORDX3:
652 case AMDGPU::FLAT_STORE_DWORDX4:
653 return AMDGPU::FLAT_STORE_DWORD;
654 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
655 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
656 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
657 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
658 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
659 }
660}
661
662// GLOBAL loads and stores are classified as FLAT initially. If both combined
663// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
664// If either or both instructions are non-segment-specific FLAT, the resulting
665// combined operation will be FLAT, potentially promoting one of the GLOBAL
666// operations to FLAT.
667// For other instructions, return the original unmodified class.
668InstClassEnum
669SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
670 const CombineInfo &Paired) {
671 assert(CI.InstClass == Paired.InstClass);
672
673 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
674 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
675 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
676
677 return CI.InstClass;
678}
679
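// Determine which named address operands (vaddr, srsrc, soffset, saddr, sbase,
// addr, ssamp, ...) are present for the given opcode.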
680static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
681 AddressRegs Result;
682
683 if (TII.isMUBUF(Opc)) {
684 if (AMDGPU::getMUBUFHasVAddr(Opc))
685 Result.VAddr = true;
686 if (AMDGPU::getMUBUFHasSrsrc(Opc))
687 Result.SRsrc = true;
688 if (AMDGPU::getMUBUFHasSoffset(Opc))
689 Result.SOffset = true;
690
691 return Result;
692 }
693
694 if (TII.isImage(Opc)) {
695 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
696 if (VAddr0Idx >= 0) {
697 int RsrcName =
698 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
699 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
700 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
701 } else {
702 Result.VAddr = true;
703 }
704 Result.SRsrc = true;
705 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
706 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
707 Result.SSamp = true;
708
709 return Result;
710 }
711 if (TII.isMTBUF(Opc)) {
712 if (AMDGPU::getMTBUFHasVAddr(Opc))
713 Result.VAddr = true;
714 if (AMDGPU::getMTBUFHasSrsrc(Opc))
715 Result.SRsrc = true;
716 if (AMDGPU::getMTBUFHasSoffset(Opc))
717 Result.SOffset = true;
718
719 return Result;
720 }
721
722 switch (Opc) {
723 default:
724 return Result;
725 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
726 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
727 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
728 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
729 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
730 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
731 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
732 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
733 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
734 Result.SOffset = true;
735 [[fallthrough]];
736 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
737 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
738 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
739 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
740 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
741 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
742 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
743 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
744 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
745 case AMDGPU::S_LOAD_DWORD_IMM:
746 case AMDGPU::S_LOAD_DWORDX2_IMM:
747 case AMDGPU::S_LOAD_DWORDX3_IMM:
748 case AMDGPU::S_LOAD_DWORDX4_IMM:
749 case AMDGPU::S_LOAD_DWORDX8_IMM:
750 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
751 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
752 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
753 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
754 Result.SBase = true;
755 return Result;
756 case AMDGPU::DS_READ_B32:
757 case AMDGPU::DS_READ_B64:
758 case AMDGPU::DS_READ_B32_gfx9:
759 case AMDGPU::DS_READ_B64_gfx9:
760 case AMDGPU::DS_WRITE_B32:
761 case AMDGPU::DS_WRITE_B64:
762 case AMDGPU::DS_WRITE_B32_gfx9:
763 case AMDGPU::DS_WRITE_B64_gfx9:
764 Result.Addr = true;
765 return Result;
766 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
767 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
768 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
769 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
770 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
771 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
772 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
773 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
774 Result.SAddr = true;
775 [[fallthrough]];
776 case AMDGPU::GLOBAL_LOAD_DWORD:
777 case AMDGPU::GLOBAL_LOAD_DWORDX2:
778 case AMDGPU::GLOBAL_LOAD_DWORDX3:
779 case AMDGPU::GLOBAL_LOAD_DWORDX4:
780 case AMDGPU::GLOBAL_STORE_DWORD:
781 case AMDGPU::GLOBAL_STORE_DWORDX2:
782 case AMDGPU::GLOBAL_STORE_DWORDX3:
783 case AMDGPU::GLOBAL_STORE_DWORDX4:
784 case AMDGPU::FLAT_LOAD_DWORD:
785 case AMDGPU::FLAT_LOAD_DWORDX2:
786 case AMDGPU::FLAT_LOAD_DWORDX3:
787 case AMDGPU::FLAT_LOAD_DWORDX4:
788 case AMDGPU::FLAT_STORE_DWORD:
789 case AMDGPU::FLAT_STORE_DWORDX2:
790 case AMDGPU::FLAT_STORE_DWORDX3:
791 case AMDGPU::FLAT_STORE_DWORDX4:
792 Result.VAddr = true;
793 return Result;
794 }
795}
796
797void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
798 const SILoadStoreOptimizer &LSO) {
799 I = MI;
800 unsigned Opc = MI->getOpcode();
801 InstClass = getInstClass(Opc, *LSO.TII);
802
803 if (InstClass == UNKNOWN)
804 return;
805
806 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
807
808 switch (InstClass) {
809 case DS_READ:
810 EltSize =
811 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
812 : 4;
813 break;
814 case DS_WRITE:
815 EltSize =
816 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
817 : 4;
818 break;
819 case S_BUFFER_LOAD_IMM:
820 case S_BUFFER_LOAD_SGPR_IMM:
821 case S_LOAD_IMM:
822 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
823 break;
824 default:
825 EltSize = 4;
826 break;
827 }
828
829 if (InstClass == MIMG) {
830 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
831 // Offset is not considered for MIMG instructions.
832 Offset = 0;
833 } else {
834 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
835 Offset = I->getOperand(OffsetIdx).getImm();
836 }
837
838 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
839 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
840
841 Width = getOpcodeWidth(*I, *LSO.TII);
842
843 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
844 Offset &= 0xffff;
845 } else if (InstClass != MIMG) {
846 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
847 }
848
849 AddressRegs Regs = getRegs(Opc, *LSO.TII);
850 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
851
852 NumAddresses = 0;
853 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
854 AddrIdx[NumAddresses++] =
855 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
856 if (Regs.Addr)
857 AddrIdx[NumAddresses++] =
858 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
859 if (Regs.SBase)
860 AddrIdx[NumAddresses++] =
861 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
862 if (Regs.SRsrc)
863 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
864 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
865 if (Regs.SOffset)
866 AddrIdx[NumAddresses++] =
867 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
868 if (Regs.SAddr)
869 AddrIdx[NumAddresses++] =
870 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
871 if (Regs.VAddr)
872 AddrIdx[NumAddresses++] =
873 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
874 if (Regs.SSamp)
875 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
876 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
877 assert(NumAddresses <= MaxAddressRegs);
878
879 for (unsigned J = 0; J < NumAddresses; J++)
880 AddrReg[J] = &I->getOperand(AddrIdx[J]);
881}
882
883} // end anonymous namespace.
884
885INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
886 "SI Load Store Optimizer", false, false)
887INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
888INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
889 false, false)
890
891char SILoadStoreOptimizer::ID = 0;
892
893char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
894
895FunctionPass *llvm::createSILoadStoreOptimizerPass() {
896 return new SILoadStoreOptimizer();
897}
898
899static void addDefsUsesToList(const MachineInstr &MI,
900 DenseSet<Register> &RegDefs,
901 DenseSet<Register> &RegUses) {
902 for (const auto &Op : MI.operands()) {
903 if (!Op.isReg())
904 continue;
905 if (Op.isDef())
906 RegDefs.insert(Op.getReg());
907 if (Op.readsReg())
908 RegUses.insert(Op.getReg());
909 }
910}
911
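// Return true if it is safe to swap the order of A and B: the two must not be
// aliasing memory accesses where at least one stores, B must not define or read
// any register that A defines, and B must not define a register that A reads.
// ARegDefs/ARegUses are expected to hold A's defs and uses (see
// addDefsUsesToList above).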
912bool SILoadStoreOptimizer::canSwapInstructions(
913 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
914 const MachineInstr &A, const MachineInstr &B) const {
915 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
916 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
917 return false;
918 for (const auto &BOp : B.operands()) {
919 if (!BOp.isReg())
920 continue;
921 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
922 return false;
923 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
924 return false;
925 }
926 return true;
927}
928
929// Given that \p CI and \p Paired are adjacent memory operations, produce a new
930// MMO for the combined operation with a new access size.
931MachineMemOperand *
932SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
933 const CombineInfo &Paired) {
934 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
935 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
936
937 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
938
939 // A base pointer for the combined operation is the same as the leading
940 // operation's pointer.
941 if (Paired < CI)
942 std::swap(MMOa, MMOb);
943
944 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
945 // If merging FLAT and GLOBAL set address space to FLAT.
946 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
947 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
948
949 MachineFunction *MF = CI.I->getMF();
950 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
951}
952
953bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
954 const SIInstrInfo &TII,
955 const CombineInfo &Paired) {
956 assert(CI.InstClass == MIMG);
957
958 // Ignore instructions with tfe/lwe set.
959 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
960 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
961
962 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
963 return false;
964
965 // Check other optional immediate operands for equality.
966 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
967 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
968 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
969
970 for (auto op : OperandsToMatch) {
971 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
972 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
973 return false;
974 if (Idx != -1 &&
975 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
976 return false;
977 }
978
979 // Check DMask for overlaps.
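 // For example, dmasks 0b0011 and 0b1100 can be combined: the smaller mask
 // sits entirely below the larger mask's lowest set bit, so the merged dmask
 // 0b1111 keeps each original result in consecutive components. Interleaved
 // masks such as 0b0101 and 0b0010 are rejected below.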
980 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
981 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
982
983 if (!MaxMask)
984 return false;
985
986 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
987 if ((1u << AllowedBitsForMin) <= MinMask)
988 return false;
989
990 return true;
991}
992
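// Return a buffer format encoding with the same bits-per-component and numeric
// format as OldFormat but with ComponentCount components, or 0 if the subtarget
// has no such format (or ComponentCount > 4).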
993static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
994 unsigned ComponentCount,
995 const GCNSubtarget &STI) {
996 if (ComponentCount > 4)
997 return 0;
998
999 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1000 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1001 if (!OldFormatInfo)
1002 return 0;
1003
1004 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1005 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1006 ComponentCount,
1007 OldFormatInfo->NumFormat, STI);
1008
1009 if (!NewFormatInfo)
1010 return 0;
1011
1012 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1013 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1014
1015 return NewFormatInfo->Format;
1016}
1017
1018// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1019// highest power of two. Note that the result is well defined for all inputs
1020// including corner cases like:
1021// - if Lo == Hi, return that value
1022// - if Lo == 0, return 0 (even though the "- 1" below underflows)
1023// - if Lo > Hi, return 0 (as if the range wrapped around)
1024static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1025 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1026}
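// Worked example: mostAlignedValueInRange(5, 12) == 8, since 8 is the only
// value in [5,12] aligned to 8; (Lo - 1) ^ Hi = 4 ^ 12 = 8 has 28 leading
// zeros, so the mask keeps the top 29 bits and clears everything below bit 3.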
1027
1028bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1029 const GCNSubtarget &STI,
1030 CombineInfo &Paired,
1031 bool Modify) {
1032 assert(CI.InstClass != MIMG);
1033
1034 // XXX - Would the same offset be OK? Is there any reason this would happen or
1035 // be useful?
1036 if (CI.Offset == Paired.Offset)
1037 return false;
1038
1039 // This won't be valid if the offset isn't aligned.
1040 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1041 return false;
1042
1043 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1044
1045 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1046 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1047 if (!Info0)
1048 return false;
1049 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1050 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1051 if (!Info1)
1052 return false;
1053
1054 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1055 Info0->NumFormat != Info1->NumFormat)
1056 return false;
1057
1058 // TODO: Should be possible to support more formats, but if format loads
1059 // are not dword-aligned, the merged load might not be valid.
1060 if (Info0->BitsPerComp != 32)
1061 return false;
1062
1063 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1064 return false;
1065 }
1066
1067 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1068 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1069 CI.UseST64 = false;
1070 CI.BaseOff = 0;
1071
1072 // Handle all non-DS instructions.
1073 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1074 if (EltOffset0 + CI.Width != EltOffset1 &&
1075 EltOffset1 + Paired.Width != EltOffset0)
1076 return false;
1077 if (CI.CPol != Paired.CPol)
1078 return false;
1079 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1080 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1081 // Reject cases like:
1082 // dword + dwordx2 -> dwordx3
1083 // dword + dwordx3 -> dwordx4
1084 // If we tried to combine these cases, we would fail to extract a subreg
1085 // for the result of the second load due to SGPR alignment requirements.
1086 if (CI.Width != Paired.Width &&
1087 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1088 return false;
1089 }
1090 return true;
1091 }
1092
1093 // If the offset in elements doesn't fit in 8-bits, we might be able to use
1094 // the stride 64 versions.
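 // For instance, with EltSize == 4, byte offsets 0x4000 and 0x4100 become
 // element offsets 4096 and 4160; both are multiples of 64, and 4096/64 == 64
 // and 4160/64 == 65 fit in 8 bits, so the pair can use a *2st64 instruction.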
1095 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1096 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1097 if (Modify) {
1098 CI.Offset = EltOffset0 / 64;
1099 Paired.Offset = EltOffset1 / 64;
1100 CI.UseST64 = true;
1101 }
1102 return true;
1103 }
1104
1105 // Check if the new offsets fit in the reduced 8-bit range.
1106 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1107 if (Modify) {
1108 CI.Offset = EltOffset0;
1109 Paired.Offset = EltOffset1;
1110 }
1111 return true;
1112 }
1113
1114 // Try to shift base address to decrease offsets.
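 // For example, element offsets 300 and 340 both exceed the 8-bit field, but
 // their difference (40) fits, so a common base (here 256, the most aligned
 // value in the usable range) is subtracted: the encoded offsets become 44 and
 // 84, and BaseOff = 256 * EltSize is added to the address register instead.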
1115 uint32_t Min = std::min(EltOffset0, EltOffset1);
1116 uint32_t Max = std::max(EltOffset0, EltOffset1);
1117
1118 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1119 if (((Max - Min) & ~Mask) == 0) {
1120 if (Modify) {
1121 // From the range of values we could use for BaseOff, choose the one that
1122 // is aligned to the highest power of two, to maximise the chance that
1123 // the same offset can be reused for other load/store pairs.
1124 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1125 // Copy the low bits of the offsets, so that when we adjust them by
1126 // subtracting BaseOff they will be multiples of 64.
1127 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1128 CI.BaseOff = BaseOff * CI.EltSize;
1129 CI.Offset = (EltOffset0 - BaseOff) / 64;
1130 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1131 CI.UseST64 = true;
1132 }
1133 return true;
1134 }
1135
1136 if (isUInt<8>(Max - Min)) {
1137 if (Modify) {
1138 // From the range of values we could use for BaseOff, choose the one that
1139 // is aligned to the highest power of two, to maximise the chance that
1140 // the same offset can be reused for other load/store pairs.
1141 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1142 CI.BaseOff = BaseOff * CI.EltSize;
1143 CI.Offset = EltOffset0 - BaseOff;
1144 Paired.Offset = EltOffset1 - BaseOff;
1145 }
1146 return true;
1147 }
1148
1149 return false;
1150}
1151
1152bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1153 const CombineInfo &CI,
1154 const CombineInfo &Paired) {
1155 const unsigned Width = (CI.Width + Paired.Width);
1156 switch (CI.InstClass) {
1157 default:
1158 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1159 case S_BUFFER_LOAD_IMM:
1160 case S_BUFFER_LOAD_SGPR_IMM:
1161 case S_LOAD_IMM:
1162 switch (Width) {
1163 default:
1164 return false;
1165 case 2:
1166 case 4:
1167 case 8:
1168 return true;
1169 case 3:
1170 return STM.hasScalarDwordx3Loads();
1171 }
1172 }
1173}
1174
1175const TargetRegisterClass *
1176SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1177 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1178 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1179 }
1180 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1181 return TRI->getRegClassForReg(*MRI, Src->getReg());
1182 }
1183 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1184 return TRI->getRegClassForReg(*MRI, Src->getReg());
1185 }
1186 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1187 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1188 }
1189 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1190 return TRI->getRegClassForReg(*MRI, Src->getReg());
1191 }
1192 return nullptr;
1193}
1194
1195/// This function assumes that CI comes before Paired in a basic block. Return
1196/// an insertion point for the merged instruction or nullptr on failure.
1197SILoadStoreOptimizer::CombineInfo *
1198SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1199 CombineInfo &Paired) {
1200 // If another instruction has already been merged into CI, it may now be a
1201 // type that we can't do any further merging into.
1202 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1203 return nullptr;
1204 assert(CI.InstClass == Paired.InstClass);
1205
1206 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1207 getInstSubclass(Paired.I->getOpcode(), *TII))
1208 return nullptr;
1209
1210 // Check both offsets (or masks for MIMG) can be combined and fit in the
1211 // reduced range.
1212 if (CI.InstClass == MIMG) {
1213 if (!dmasksCanBeCombined(CI, *TII, Paired))
1214 return nullptr;
1215 } else {
1216 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1217 return nullptr;
1218 }
1219
1220 DenseSet<Register> RegDefs;
1221 DenseSet<Register> RegUses;
1222 CombineInfo *Where;
1223 if (CI.I->mayLoad()) {
1224 // Try to hoist Paired up to CI.
1225 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1226 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1227 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1228 return nullptr;
1229 }
1230 Where = &CI;
1231 } else {
1232 // Try to sink CI down to Paired.
1233 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1234 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1235 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1236 return nullptr;
1237 }
1238 Where = &Paired;
1239 }
1240
1241 // Call offsetsCanBeCombined with modify = true so that the offsets are
1242 // correct for the new instruction. This should return true, because
1243 // this function should only be called on CombineInfo objects that
1244 // have already been confirmed to be mergeable.
1245 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1246 offsetsCanBeCombined(CI, *STM, Paired, true);
1247 return Where;
1248}
1249
1250// Copy the merged load result from DestReg to the original dest regs of CI and
1251// Paired.
1252void SILoadStoreOptimizer::copyToDestRegs(
1253 CombineInfo &CI, CombineInfo &Paired,
1254 MachineBasicBlock::iterator InsertBefore, int OpName,
1255 Register DestReg) const {
1256 MachineBasicBlock *MBB = CI.I->getParent();
1257 DebugLoc DL = CI.I->getDebugLoc();
1258
1259 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1260
1261 // Copy to the old destination registers.
1262 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1263 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1264 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1265
1266 // The constrained sload instructions in S_LOAD_IMM class will have
1267 // `early-clobber` flag in the dst operand. Remove the flag before using the
1268 // MOs in copies.
1269 Dest0->setIsEarlyClobber(false);
1270 Dest1->setIsEarlyClobber(false);
1271
1272 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1273 .add(*Dest0) // Copy to same destination including flags and sub reg.
1274 .addReg(DestReg, 0, SubRegIdx0);
1275 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1276 .add(*Dest1)
1277 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1278}
1279
1280// Return a register for the source of the merged store after copying the
1281// original source regs of CI and Paired into it.
1282Register
1283SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1284 MachineBasicBlock::iterator InsertBefore,
1285 int OpName) const {
1286 MachineBasicBlock *MBB = CI.I->getParent();
1287 DebugLoc DL = CI.I->getDebugLoc();
1288
1289 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1290
1291 // Copy to the new source register.
1292 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1293 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1294
1295 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1296 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1297
1298 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1299 .add(*Src0)
1300 .addImm(SubRegIdx0)
1301 .add(*Src1)
1302 .addImm(SubRegIdx1);
1303
1304 return SrcReg;
1305}
1306
1307unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1308 if (STM->ldsRequiresM0Init())
1309 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1310 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1311}
1312
1313unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1314 if (STM->ldsRequiresM0Init())
1315 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1316
1317 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1318 : AMDGPU::DS_READ2ST64_B64_gfx9;
1319}
1320
1321MachineBasicBlock::iterator
1322SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1323 MachineBasicBlock::iterator InsertBefore) {
1324 MachineBasicBlock *MBB = CI.I->getParent();
1325
1326 // Be careful, since the addresses could be subregisters themselves in weird
1327 // cases, like vectors of pointers.
1328 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1329
1330 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1331 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1332 unsigned Opc =
1333 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1334
1335 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1336 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1337
1338 const MCInstrDesc &Read2Desc = TII->get(Opc);
1339
1340 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1341 Register DestReg = MRI->createVirtualRegister(SuperRC);
1342
1343 DebugLoc DL = CI.I->getDebugLoc();
1344
1345 Register BaseReg = AddrReg->getReg();
1346 unsigned BaseSubReg = AddrReg->getSubReg();
1347 unsigned BaseRegFlags = 0;
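 // If offsetsCanBeCombined() factored out a nonzero BaseOff, materialize it in
 // an SGPR and add it to the original address so that the reduced offset0 and
 // offset1 values fit in the 8-bit fields of the read2/read2st64 instruction.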
1348 if (CI.BaseOff) {
1349 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1350 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1351 .addImm(CI.BaseOff);
1352
1353 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1354 BaseRegFlags = RegState::Kill;
1355
1356 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1357 .addReg(ImmReg)
1358 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1359 .addImm(0); // clamp bit
1360 BaseSubReg = 0;
1361 }
1362
1363 MachineInstrBuilder Read2 =
1364 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1365 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1366 .addImm(NewOffset0) // offset0
1367 .addImm(NewOffset1) // offset1
1368 .addImm(0) // gds
1369 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1370
1371 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1372
1373 CI.I->eraseFromParent();
1374 Paired.I->eraseFromParent();
1375
1376 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1377 return Read2;
1378}
1379
1380unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1381 if (STM->ldsRequiresM0Init())
1382 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1383 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1384 : AMDGPU::DS_WRITE2_B64_gfx9;
1385}
1386
1387unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1388 if (STM->ldsRequiresM0Init())
1389 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1390 : AMDGPU::DS_WRITE2ST64_B64;
1391
1392 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1393 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1394}
1395
1396MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1397 CombineInfo &CI, CombineInfo &Paired,
1398 MachineBasicBlock::iterator InsertBefore) {
1399 MachineBasicBlock *MBB = CI.I->getParent();
1400
1401 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1402 // sure we preserve the subregister index and any register flags set on them.
1403 const MachineOperand *AddrReg =
1404 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1405 const MachineOperand *Data0 =
1406 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1407 const MachineOperand *Data1 =
1408 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1409
1410 unsigned NewOffset0 = CI.Offset;
1411 unsigned NewOffset1 = Paired.Offset;
1412 unsigned Opc =
1413 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1414
1415 if (NewOffset0 > NewOffset1) {
1416 // Canonicalize the merged instruction so the smaller offset comes first.
1417 std::swap(NewOffset0, NewOffset1);
1418 std::swap(Data0, Data1);
1419 }
1420
1421 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1422 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1423
1424 const MCInstrDesc &Write2Desc = TII->get(Opc);
1425 DebugLoc DL = CI.I->getDebugLoc();
1426
1427 Register BaseReg = AddrReg->getReg();
1428 unsigned BaseSubReg = AddrReg->getSubReg();
1429 unsigned BaseRegFlags = 0;
1430 if (CI.BaseOff) {
1431 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1432 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1433 .addImm(CI.BaseOff);
1434
1435 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1436 BaseRegFlags = RegState::Kill;
1437
1438 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1439 .addReg(ImmReg)
1440 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1441 .addImm(0); // clamp bit
1442 BaseSubReg = 0;
1443 }
1444
1445 MachineInstrBuilder Write2 =
1446 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1447 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1448 .add(*Data0) // data0
1449 .add(*Data1) // data1
1450 .addImm(NewOffset0) // offset0
1451 .addImm(NewOffset1) // offset1
1452 .addImm(0) // gds
1453 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1454
1455 CI.I->eraseFromParent();
1456 Paired.I->eraseFromParent();
1457
1458 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1459 return Write2;
1460}
1461
1462MachineBasicBlock::iterator
1463SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1464 MachineBasicBlock::iterator InsertBefore) {
1465 MachineBasicBlock *MBB = CI.I->getParent();
1466 DebugLoc DL = CI.I->getDebugLoc();
1467 const unsigned Opcode = getNewOpcode(CI, Paired);
1468
1469 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1470
1471 Register DestReg = MRI->createVirtualRegister(SuperRC);
1472 unsigned MergedDMask = CI.DMask | Paired.DMask;
1473 unsigned DMaskIdx =
1474 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1475
1476 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1477 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1478 if (I == DMaskIdx)
1479 MIB.addImm(MergedDMask);
1480 else
1481 MIB.add((*CI.I).getOperand(I));
1482 }
1483
1484 // It shouldn't be possible to get this far if the two instructions
1485 // don't have a single memoperand, because MachineInstr::mayAlias()
1486 // will return true if this is the case.
1487 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1488
1489 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1490
1491 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1492
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1495 return New;
1496}
1497
1498MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1499 CombineInfo &CI, CombineInfo &Paired,
1500 MachineBasicBlock::iterator InsertBefore) {
1501 MachineBasicBlock *MBB = CI.I->getParent();
1502 DebugLoc DL = CI.I->getDebugLoc();
1503 const unsigned Opcode = getNewOpcode(CI, Paired);
1504
1505 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1506
1507 Register DestReg = MRI->createVirtualRegister(SuperRC);
1508 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1509
1510 // It shouldn't be possible to get this far if the two instructions
1511 // don't have a single memoperand, because MachineInstr::mayAlias()
1512 // will return true if this is the case.
1513 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1514
1515 MachineInstrBuilder New =
1516 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1517 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1518 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1519 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1520 New.addImm(MergedOffset);
1521 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1522
1523 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1524
1525 CI.I->eraseFromParent();
1526 Paired.I->eraseFromParent();
1527 return New;
1528}
1529
1530MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1531 CombineInfo &CI, CombineInfo &Paired,
1532 MachineBasicBlock::iterator InsertBefore) {
1533 MachineBasicBlock *MBB = CI.I->getParent();
1534 DebugLoc DL = CI.I->getDebugLoc();
1535
1536 const unsigned Opcode = getNewOpcode(CI, Paired);
1537
1538 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1539
1540 // Copy to the new source register.
1541 Register DestReg = MRI->createVirtualRegister(SuperRC);
1542 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1543
1544 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1545
1546 AddressRegs Regs = getRegs(Opcode, *TII);
1547
1548 if (Regs.VAddr)
1549 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1550
1551 // It shouldn't be possible to get this far if the two instructions
1552 // don't have a single memoperand, because MachineInstr::mayAlias()
1553 // will return true if this is the case.
1554 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1555
1556 MachineInstr *New =
1557 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1558 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1559 .addImm(MergedOffset) // offset
1560 .addImm(CI.CPol) // cpol
1561 .addImm(0) // swz
1562 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1563
1564 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1565
1566 CI.I->eraseFromParent();
1567 Paired.I->eraseFromParent();
1568 return New;
1569}
1570
1571MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1572 CombineInfo &CI, CombineInfo &Paired,
1573 MachineBasicBlock::iterator InsertBefore) {
1574 MachineBasicBlock *MBB = CI.I->getParent();
1575 DebugLoc DL = CI.I->getDebugLoc();
1576
1577 const unsigned Opcode = getNewOpcode(CI, Paired);
1578
1579 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1580
1581 // Copy to the new source register.
1582 Register DestReg = MRI->createVirtualRegister(SuperRC);
1583 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1584
1585 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1586
1587 AddressRegs Regs = getRegs(Opcode, *TII);
1588
1589 if (Regs.VAddr)
1590 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1591
1592 unsigned JoinedFormat =
1593 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1594
1595 // It shouldn't be possible to get this far if the two instructions
1596 // don't have a single memoperand, because MachineInstr::mayAlias()
1597 // will return true if this is the case.
1598 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1599
1600 MachineInstr *New =
1601 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1602 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1603 .addImm(MergedOffset) // offset
1604 .addImm(JoinedFormat) // format
1605 .addImm(CI.CPol) // cpol
1606 .addImm(0) // swz
1607 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1608
1609 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1610
1611 CI.I->eraseFromParent();
1612 Paired.I->eraseFromParent();
1613 return New;
1614}
1615
1616MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1617 CombineInfo &CI, CombineInfo &Paired,
1618 MachineBasicBlock::iterator InsertBefore) {
1619 MachineBasicBlock *MBB = CI.I->getParent();
1620 DebugLoc DL = CI.I->getDebugLoc();
1621
1622 const unsigned Opcode = getNewOpcode(CI, Paired);
1623
1624 Register SrcReg =
1625 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1626
1627 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1628 .addReg(SrcReg, RegState::Kill);
1629
1630 AddressRegs Regs = getRegs(Opcode, *TII);
1631
1632 if (Regs.VAddr)
1633 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1634
1635 unsigned JoinedFormat =
1636 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1637
1638 // It shouldn't be possible to get this far if the two instructions
1639 // don't have a single memoperand, because MachineInstr::mayAlias()
1640 // will return true if this is the case.
1641 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1642
1643 MachineInstr *New =
1644 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1645 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1646 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1647 .addImm(JoinedFormat) // format
1648 .addImm(CI.CPol) // cpol
1649 .addImm(0) // swz
1650 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1651
1652 CI.I->eraseFromParent();
1653 Paired.I->eraseFromParent();
1654 return New;
1655}
1656
1657MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1658 CombineInfo &CI, CombineInfo &Paired,
1659 MachineBasicBlock::iterator InsertBefore) {
1660 MachineBasicBlock *MBB = CI.I->getParent();
1661 DebugLoc DL = CI.I->getDebugLoc();
1662
1663 const unsigned Opcode = getNewOpcode(CI, Paired);
1664
1665 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1666 Register DestReg = MRI->createVirtualRegister(SuperRC);
1667
1668 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1669
1670 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1671 MIB.add(*SAddr);
1672
1673 MachineInstr *New =
1674 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1675 .addImm(std::min(CI.Offset, Paired.Offset))
1676 .addImm(CI.CPol)
1677 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1678
1679 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1680
1681 CI.I->eraseFromParent();
1682 Paired.I->eraseFromParent();
1683 return New;
1684}
1685
1686MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1687 CombineInfo &CI, CombineInfo &Paired,
1688 MachineBasicBlock::iterator InsertBefore) {
1689 MachineBasicBlock *MBB = CI.I->getParent();
1690 DebugLoc DL = CI.I->getDebugLoc();
1691
1692 const unsigned Opcode = getNewOpcode(CI, Paired);
1693
1694 Register SrcReg =
1695 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1696
1697 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1698 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1699 .addReg(SrcReg, RegState::Kill);
1700
1701 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1702 MIB.add(*SAddr);
1703
1704 MachineInstr *New =
1705 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1706 .addImm(CI.CPol)
1707 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1708
1709 CI.I->eraseFromParent();
1710 Paired.I->eraseFromParent();
1711 return New;
1712}
1713
1713
1714 static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1715                                    ArrayRef<MachineMemOperand *> MMOs,
1716                                    unsigned Width) {
1717 // Conservatively returns true if the MMO is not found.
1718 return STM.isXNACKEnabled() &&
1719 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1720}
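// Worked example (a sketch, assuming XNACK is enabled): merging two dword
// loads gives Width == 2, so the merged access is expected to be aligned to
// Width * 4 == 8 bytes. A single MMO with only 4-byte alignment then makes
// this return true, which selects the constrained "_ec" opcode variants in
// getNewOpcode() below.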
1721
1722unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1723 const CombineInfo &Paired) {
1724 const unsigned Width = CI.Width + Paired.Width;
1725
1726 switch (getCommonInstClass(CI, Paired)) {
1727 default:
1728 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1729 // FIXME: Handle d16 correctly
1730 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1731 Width);
1732 case TBUFFER_LOAD:
1733 case TBUFFER_STORE:
1734 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1735 Width);
1736
1737 case UNKNOWN:
1738 llvm_unreachable("Unknown instruction class");
1739 case S_BUFFER_LOAD_IMM: {
1740 // If XNACK is enabled, use the constrained opcodes when the first load is
1741 // under-aligned.
1742 bool NeedsConstrainedOpc =
1743 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1744 switch (Width) {
1745 default:
1746 return 0;
1747 case 2:
1748 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1749 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1750 case 3:
1751 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1752 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1753 case 4:
1754 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1755 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1756 case 8:
1757 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1758 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1759 }
1760 }
1761 case S_BUFFER_LOAD_SGPR_IMM: {
1762 // If XNACK is enabled, use the constrained opcodes when the first load is
1763 // under-aligned.
1764 bool NeedsConstrainedOpc =
1765 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1766 switch (Width) {
1767 default:
1768 return 0;
1769 case 2:
1770 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1771 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1772 case 3:
1773 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1774 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1775 case 4:
1776 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1777 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1778 case 8:
1779 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1780 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1781 }
1782 }
1783 case S_LOAD_IMM: {
1784 // If XNACK is enabled, use the constrained opcodes when the first load is
1785 // under-aligned.
1786 bool NeedsConstrainedOpc =
1787 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1788 switch (Width) {
1789 default:
1790 return 0;
1791 case 2:
1792 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1793 : AMDGPU::S_LOAD_DWORDX2_IMM;
1794 case 3:
1795 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1796 : AMDGPU::S_LOAD_DWORDX3_IMM;
1797 case 4:
1798 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1799 : AMDGPU::S_LOAD_DWORDX4_IMM;
1800 case 8:
1801 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1802 : AMDGPU::S_LOAD_DWORDX8_IMM;
1803 }
1804 }
1805 case GLOBAL_LOAD:
1806 switch (Width) {
1807 default:
1808 return 0;
1809 case 2:
1810 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1811 case 3:
1812 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1813 case 4:
1814 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1815 }
1816 case GLOBAL_LOAD_SADDR:
1817 switch (Width) {
1818 default:
1819 return 0;
1820 case 2:
1821 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1822 case 3:
1823 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1824 case 4:
1825 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1826 }
1827 case GLOBAL_STORE:
1828 switch (Width) {
1829 default:
1830 return 0;
1831 case 2:
1832 return AMDGPU::GLOBAL_STORE_DWORDX2;
1833 case 3:
1834 return AMDGPU::GLOBAL_STORE_DWORDX3;
1835 case 4:
1836 return AMDGPU::GLOBAL_STORE_DWORDX4;
1837 }
1838 case GLOBAL_STORE_SADDR:
1839 switch (Width) {
1840 default:
1841 return 0;
1842 case 2:
1843 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1844 case 3:
1845 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1846 case 4:
1847 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1848 }
1849 case FLAT_LOAD:
1850 switch (Width) {
1851 default:
1852 return 0;
1853 case 2:
1854 return AMDGPU::FLAT_LOAD_DWORDX2;
1855 case 3:
1856 return AMDGPU::FLAT_LOAD_DWORDX3;
1857 case 4:
1858 return AMDGPU::FLAT_LOAD_DWORDX4;
1859 }
1860 case FLAT_STORE:
1861 switch (Width) {
1862 default:
1863 return 0;
1864 case 2:
1865 return AMDGPU::FLAT_STORE_DWORDX2;
1866 case 3:
1867 return AMDGPU::FLAT_STORE_DWORDX3;
1868 case 4:
1869 return AMDGPU::FLAT_STORE_DWORDX4;
1870 }
1871 case MIMG:
1872 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1873 "No overlaps");
1874 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1875 }
1876}
1877
1878std::pair<unsigned, unsigned>
1879SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1880 const CombineInfo &Paired) {
1881 assert((CI.InstClass != MIMG ||
1882 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1883 CI.Width + Paired.Width)) &&
1884 "No overlaps");
1885
1886 unsigned Idx0;
1887 unsigned Idx1;
1888
1889 static const unsigned Idxs[5][4] = {
1890 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1891 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1892 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1893 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1894 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1895 };
1896
1897 assert(CI.Width >= 1 && CI.Width <= 4);
1898 assert(Paired.Width >= 1 && Paired.Width <= 4);
1899
1900 if (Paired < CI) {
1901 Idx1 = Idxs[0][Paired.Width - 1];
1902 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1903 } else {
1904 Idx0 = Idxs[0][CI.Width - 1];
1905 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1906 }
1907
1908 return {Idx0, Idx1};
1909}
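// Worked example (a sketch of the table lookup above): with CI.Width == 2 and
// Paired.Width == 1, and with CI not ordered after Paired (the else branch),
// the table yields Idx0 = Idxs[0][1] == AMDGPU::sub0_sub1 and
// Idx1 = Idxs[2][0] == AMDGPU::sub2, i.e. CI's data occupies the low two
// dwords of the merged register and Paired's data the third dword.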
1910
1911const TargetRegisterClass *
1912SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1913 const CombineInfo &Paired) const {
1914 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1915 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1916 switch (CI.Width + Paired.Width) {
1917 default:
1918 return nullptr;
1919 case 2:
1920 return &AMDGPU::SReg_64_XEXECRegClass;
1921 case 3:
1922 return &AMDGPU::SGPR_96RegClass;
1923 case 4:
1924 return &AMDGPU::SGPR_128RegClass;
1925 case 8:
1926 return &AMDGPU::SGPR_256RegClass;
1927 case 16:
1928 return &AMDGPU::SGPR_512RegClass;
1929 }
1930 }
1931
1932 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1933 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1934 ? TRI->getAGPRClassForBitWidth(BitWidth)
1935 : TRI->getVGPRClassForBitWidth(BitWidth);
1936}
1937
1938MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1939 CombineInfo &CI, CombineInfo &Paired,
1940 MachineBasicBlock::iterator InsertBefore) {
1941 MachineBasicBlock *MBB = CI.I->getParent();
1942 DebugLoc DL = CI.I->getDebugLoc();
1943
1944 const unsigned Opcode = getNewOpcode(CI, Paired);
1945
1946 Register SrcReg =
1947 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1948
1949 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1950 .addReg(SrcReg, RegState::Kill);
1951
1952 AddressRegs Regs = getRegs(Opcode, *TII);
1953
1954 if (Regs.VAddr)
1955 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1956
1957
1958 // It shouldn't be possible to get this far if the two instructions
1959 // don't have a single memoperand, because MachineInstr::mayAlias()
1960 // will return true if this is the case.
1961 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1962
1963 MachineInstr *New =
1964 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1965 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1966 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1967 .addImm(CI.CPol) // cpol
1968 .addImm(0) // swz
1969 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1970
1971 CI.I->eraseFromParent();
1972 Paired.I->eraseFromParent();
1973 return New;
1974}
1975
1976 MachineOperand
1977 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1978 APInt V(32, Val, true);
1979 if (TII->isInlineConstant(V))
1980 return MachineOperand::CreateImm(Val);
1981
1982 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1983 MachineInstr *Mov =
1984 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1985 TII->get(AMDGPU::S_MOV_B32), Reg)
1986 .addImm(Val);
1987 (void)Mov;
1988 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1989 return MachineOperand::CreateReg(Reg, false);
1990}
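// Worked example (a sketch): createRegOrImm(4, MI) yields an immediate
// operand, since 4 is a valid inline constant, while createRegOrImm(0x1800, MI)
// is not inlinable and instead materializes the value with an S_MOV_B32 into a
// fresh SReg_32 virtual register.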
1991
1992// Compute base address using Addr and return the final register.
1993Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1994 const MemAddress &Addr) const {
1995 MachineBasicBlock *MBB = MI.getParent();
1996 MachineBasicBlock::iterator MBBI = MI.getIterator();
1997 DebugLoc DL = MI.getDebugLoc();
1998
1999 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2000 Addr.Base.LoSubReg) &&
2001 "Expected 32-bit Base-Register-Low!!");
2002
2003 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2004 Addr.Base.HiSubReg) &&
2005 "Expected 32-bit Base-Register-Hi!!");
2006
2007 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
2008 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2009 MachineOperand OffsetHi =
2010 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2011
2012 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
2013 Register CarryReg = MRI->createVirtualRegister(CarryRC);
2014 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2015
2016 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2017 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2018 MachineInstr *LoHalf =
2019 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2020 .addReg(CarryReg, RegState::Define)
2021 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2022 .add(OffsetLo)
2023 .addImm(0); // clamp bit
2024 (void)LoHalf;
2025 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2026
2027 MachineInstr *HiHalf =
2028 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2029 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2030 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2031 .add(OffsetHi)
2032 .addReg(CarryReg, RegState::Kill)
2033 .addImm(0); // clamp bit
2034 (void)HiHalf;
2035 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2036
2037 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2038 MachineInstr *FullBase =
2039 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2040 .addReg(DestSub0)
2041 .addImm(AMDGPU::sub0)
2042 .addReg(DestSub1)
2043 .addImm(AMDGPU::sub1);
2044 (void)FullBase;
2045 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2046
2047 return FullDestReg;
2048}
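// The V_ADD_CO_U32 / V_ADDC_U32 pair above is a 64-bit add split into 32-bit
// halves. A minimal standalone sketch of the same arithmetic (hypothetical
// helper, not used by the pass):
//
//   uint64_t addOffsetToBase(uint64_t Base, int64_t Offset) {
//     uint32_t Lo = uint32_t(Base) + uint32_t(Offset);
//     uint32_t Carry = Lo < uint32_t(Base);   // carry-out of V_ADD_CO_U32
//     uint32_t Hi = uint32_t(Base >> 32) + uint32_t(Offset >> 32) + Carry;
//     return (uint64_t(Hi) << 32) | Lo;       // REG_SEQUENCE of sub1:sub0
//   }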
2049
2050// Update base and offset with the NewBase and NewOffset in MI.
2051void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2052 Register NewBase,
2053 int32_t NewOffset) const {
2054 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2055 Base->setReg(NewBase);
2056 Base->setIsKill(false);
2057 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2058}
2059
2060std::optional<int32_t>
2061SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2062 if (Op.isImm())
2063 return Op.getImm();
2064
2065 if (!Op.isReg())
2066 return std::nullopt;
2067
2068 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2069 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2070 !Def->getOperand(1).isImm())
2071 return std::nullopt;
2072
2073 return Def->getOperand(1).getImm();
2074}
2075
2076// Analyze Base and extracts:
2077// - 32bit base registers, subregisters
2078// - 64bit constant offset
2079// Expecting base computation as:
2080// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2081// %LO:vgpr_32, %c:sreg_64_xexec =
2082// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2083// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2084// %Base:vreg_64 =
2085// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2086void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2087 MemAddress &Addr) const {
2088 if (!Base.isReg())
2089 return;
2090
2091 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2092 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2093 || Def->getNumOperands() != 5)
2094 return;
2095
2096 MachineOperand BaseLo = Def->getOperand(1);
2097 MachineOperand BaseHi = Def->getOperand(3);
2098 if (!BaseLo.isReg() || !BaseHi.isReg())
2099 return;
2100
2101 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2102 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2103
2104 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2105 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2106 return;
2107
2108 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2109 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2110
2111 auto Offset0P = extractConstOffset(*Src0);
2112 if (Offset0P)
2113 BaseLo = *Src1;
2114 else {
2115 if (!(Offset0P = extractConstOffset(*Src1)))
2116 return;
2117 BaseLo = *Src0;
2118 }
2119
2120 if (!BaseLo.isReg())
2121 return;
2122
2123 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2124 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2125
2126 if (Src0->isImm())
2127 std::swap(Src0, Src1);
2128
2129 if (!Src1->isImm() || Src0->isImm())
2130 return;
2131
2132 uint64_t Offset1 = Src1->getImm();
2133 BaseHi = *Src0;
2134
2135 if (!BaseHi.isReg())
2136 return;
2137
2138 Addr.Base.LoReg = BaseLo.getReg();
2139 Addr.Base.HiReg = BaseHi.getReg();
2140 Addr.Base.LoSubReg = BaseLo.getSubReg();
2141 Addr.Base.HiSubReg = BaseHi.getSubReg();
2142 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2143}
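// Worked example (a sketch, matching the pattern documented above): for a
// base built as
//   %lo, %c = V_ADD_CO_U32_e64 %base_lo, 0x1800
//   %hi     = V_ADDC_U32_e64   %base_hi, 0, %c
// this records LoReg/HiReg = %base_lo/%base_hi and assembles
//   Addr.Offset = (0x1800 & 0xffffffff) | (0ull << 32) == 0x1800.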
2144
2145 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2146     MachineInstr &MI,
2147     MemInfoMap &Visited,
2148     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2149
2150 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2151 return false;
2152
2153 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2154 if (SIInstrInfo::isFLATScratch(MI))
2155 return false;
2156
2157 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2158                                             : AMDGPUAS::FLAT_ADDRESS;
2159
2159
2160 if (AnchorList.count(&MI))
2161 return false;
2162
2163 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2164
2165 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2166 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2167 return false;
2168 }
2169
2170 // Step1: Find the base-registers and a 64bit constant offset.
2171 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2172 MemAddress MAddr;
2173 if (!Visited.contains(&MI)) {
2174 processBaseWithConstOffset(Base, MAddr);
2175 Visited[&MI] = MAddr;
2176 } else
2177 MAddr = Visited[&MI];
2178
2179 if (MAddr.Offset == 0) {
2180 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2181 " constant offsets that can be promoted.\n";);
2182 return false;
2183 }
2184
2185 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2186 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2187
2188 // Step2: Traverse MI's basic block and find an anchor (an instruction with
2189 // the same base registers) at the greatest 13-bit distance from MI's offset.
2190 // E.g. (64bit loads)
2191 // bb:
2192 // addr1 = &a + 4096; load1 = load(addr1, 0)
2193 // addr2 = &a + 6144; load2 = load(addr2, 0)
2194 // addr3 = &a + 8192; load3 = load(addr3, 0)
2195 // addr4 = &a + 10240; load4 = load(addr4, 0)
2196 // addr5 = &a + 12288; load5 = load(addr5, 0)
2197 //
2198 // Starting from the first load, the optimization tries to find a new base
2199 // from which (&a + 4096) is within a 13-bit distance. Both &a + 6144 and
2200 // &a + 8192 are within a 13-bit distance of &a + 4096. The heuristic picks
2201 // &a + 8192 as the new base (anchor) because the greater distance can
2202 // presumably accommodate more intermediate offsets.
2203 //
2204 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2205 // (&a + 8192) for load1, load2, load4.
2206 // addr = &a + 8192
2207 // load1 = load(addr, -4096)
2208 // load2 = load(addr, -2048)
2209 // load3 = load(addr, 0)
2210 // load4 = load(addr, 2048)
2211 // addr5 = &a + 12288; load5 = load(addr5, 0)
2212 //
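// A sketch of the selection below (assuming, as in the example above, a
// signed 13-bit immediate window of roughly [-4096, 4095]): for each candidate
// with the same base registers, Dist = MAddr.Offset - MAddrNext.Offset is
// tested for legality and the legal candidate with the largest |Dist| wins.
// With offsets {4096, 6144, 8192, 10240, 12288} and MI at 4096, both 6144
// (Dist == -2048) and 8192 (Dist == -4096) are legal, and 8192 is chosen
// because |-4096| > |-2048|.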
2213 MachineInstr *AnchorInst = nullptr;
2214 MemAddress AnchorAddr;
2215 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2216 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2217
2218 MachineBasicBlock *MBB = MI.getParent();
2219 MachineBasicBlock::iterator E = MBB->end();
2220 MachineBasicBlock::iterator MBBI = MI.getIterator();
2221 ++MBBI;
2222 const SITargetLowering *TLI =
2223 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2224
2225 for ( ; MBBI != E; ++MBBI) {
2226 MachineInstr &MINext = *MBBI;
2227 // TODO: Support finding an anchor (with the same base) from store addresses
2228 // or any other load addresses where the opcodes are different.
2229 if (MINext.getOpcode() != MI.getOpcode() ||
2230 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2231 continue;
2232
2233 const MachineOperand &BaseNext =
2234 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2235 MemAddress MAddrNext;
2236 if (!Visited.contains(&MINext)) {
2237 processBaseWithConstOffset(BaseNext, MAddrNext);
2238 Visited[&MINext] = MAddrNext;
2239 } else
2240 MAddrNext = Visited[&MINext];
2241
2242 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2243 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2244 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2245 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2246 continue;
2247
2248 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2249
2250 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2251 TargetLoweringBase::AddrMode AM;
2252 AM.HasBaseReg = true;
2253 AM.BaseOffs = Dist;
2254 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2255 (uint32_t)std::abs(Dist) > MaxDist) {
2256 MaxDist = std::abs(Dist);
2257
2258 AnchorAddr = MAddrNext;
2259 AnchorInst = &MINext;
2260 }
2261 }
2262
2263 if (AnchorInst) {
2264 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2265 AnchorInst->dump());
2266 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2267 << AnchorAddr.Offset << "\n\n");
2268
2269 // Instead of moving the anchor up, just re-compute its base address.
2270 Register Base = computeBase(MI, AnchorAddr);
2271
2272 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2273 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2274
2275 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2276 TargetLoweringBase::AddrMode AM;
2277 AM.HasBaseReg = true;
2278 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2279
2280 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2281 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2282 OtherMI->dump());
2283 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2284 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2285 }
2286 }
2287 AnchorList.insert(AnchorInst);
2288 return true;
2289 }
2290
2291 return false;
2292}
2293
2294void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2295 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2296 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2297 if (AddrList.front().InstClass == CI.InstClass &&
2298 AddrList.front().IsAGPR == CI.IsAGPR &&
2299 AddrList.front().hasSameBaseAddress(CI)) {
2300 AddrList.emplace_back(CI);
2301 return;
2302 }
2303 }
2304
2305 // Base address not found, so add a new list.
2306 MergeableInsts.emplace_back(1, CI);
2307}
2308
2309std::pair<MachineBasicBlock::iterator, bool>
2310 SILoadStoreOptimizer::collectMergeableInsts(
2311     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2312     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2313 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2314 bool Modified = false;
2315
2316 // Sort potential mergeable instructions into lists. One list per base address.
2317 unsigned Order = 0;
2318 MachineBasicBlock::iterator BlockI = Begin;
2319 for (; BlockI != End; ++BlockI) {
2320 MachineInstr &MI = *BlockI;
2321
2322 // We run this before checking if an address is mergeable, because it can produce
2323 // better code even if the instructions aren't mergeable.
2324 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2325 Modified = true;
2326
2327 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2328 // barriers. We can look after this barrier for separate merges.
2329 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2330 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2331
2332 // Search will resume after this instruction in a separate merge list.
2333 ++BlockI;
2334 break;
2335 }
2336
2337 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2338 if (InstClass == UNKNOWN)
2339 continue;
2340
2341 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2342 int Swizzled =
2343 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2344 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2345 continue;
2346
2347 CombineInfo CI;
2348 CI.setMI(MI, *this);
2349 CI.Order = Order++;
2350
2351 if (!CI.hasMergeableAddress(*MRI))
2352 continue;
2353
2354 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2355 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2356 //        operands. However, we report that ds_write2 shall have only
2357 //        VGPR data so that machine copy propagation does not create an
2358 //        illegal instruction with mixed VGPR and AGPR sources.
2359 //        Consequently, if we created such an instruction, the verifier
2360 //        would complain.
2361 continue;
2362 }
2363
2364 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2365
2366 addInstToMergeableList(CI, MergeableInsts);
2367 }
2368
2369 // At this point we have lists of Mergeable instructions.
2370 //
2371 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2372 // list try to find an instruction that can be merged with I. If an instruction
2373 // is found, it is stored in the Paired field. If no instructions are found, then
2374 // the CombineInfo object is deleted from the list.
2375
2376 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2377 E = MergeableInsts.end(); I != E;) {
2378
2379 std::list<CombineInfo> &MergeList = *I;
2380 if (MergeList.size() <= 1) {
2381 // This means we have found only one instruction with a given address
2382 // that can be merged, and we need at least 2 instructions to do a merge,
2383 // so this list can be discarded.
2384 I = MergeableInsts.erase(I);
2385 continue;
2386 }
2387
2388 // Sort the lists by offsets, this way mergeable instructions will be
2389 // adjacent to each other in the list, which will make it easier to find
2390 // matches.
2391 MergeList.sort(
2392 [] (const CombineInfo &A, const CombineInfo &B) {
2393 return A.Offset < B.Offset;
2394 });
2395 ++I;
2396 }
2397
2398 return {BlockI, Modified};
2399}
2400
2401// Scan through looking for adjacent LDS operations with constant offsets from
2402// the same base register. We rely on the scheduler to do the hard work of
2403// clustering nearby loads, and assume these are all adjacent.
2404bool SILoadStoreOptimizer::optimizeBlock(
2405 std::list<std::list<CombineInfo> > &MergeableInsts) {
2406 bool Modified = false;
2407
2408 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2409 E = MergeableInsts.end(); I != E;) {
2410 std::list<CombineInfo> &MergeList = *I;
2411
2412 bool OptimizeListAgain = false;
2413 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2414 // We weren't able to make any changes, so delete the list so we don't
2415 // process the same instructions the next time we try to optimize this
2416 // block.
2417 I = MergeableInsts.erase(I);
2418 continue;
2419 }
2420
2421 Modified = true;
2422
2423 // We made changes, but also determined that there were no more optimization
2424 // opportunities, so we don't need to reprocess the list
2425 if (!OptimizeListAgain) {
2426 I = MergeableInsts.erase(I);
2427 continue;
2428 }
2429 OptimizeAgain = true;
2430 }
2431 return Modified;
2432}
2433
2434bool
2435SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2436 std::list<CombineInfo> &MergeList,
2437 bool &OptimizeListAgain) {
2438 if (MergeList.empty())
2439 return false;
2440
2441 bool Modified = false;
2442
2443 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2444 Next = std::next(I)) {
2445
2446 auto First = I;
2447 auto Second = Next;
2448
2449 if ((*First).Order > (*Second).Order)
2450 std::swap(First, Second);
2451 CombineInfo &CI = *First;
2452 CombineInfo &Paired = *Second;
2453
2454 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2455 if (!Where) {
2456 ++I;
2457 continue;
2458 }
2459
2460 Modified = true;
2461
2462 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2463
2464 MachineBasicBlock::iterator NewMI;
2465 switch (CI.InstClass) {
2466 default:
2467 llvm_unreachable("unknown InstClass");
2468 break;
2469 case DS_READ:
2470 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2471 break;
2472 case DS_WRITE:
2473 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2474 break;
2475 case S_BUFFER_LOAD_IMM:
2476 case S_BUFFER_LOAD_SGPR_IMM:
2477 case S_LOAD_IMM:
2478 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2479 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2480 break;
2481 case BUFFER_LOAD:
2482 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2483 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2484 break;
2485 case BUFFER_STORE:
2486 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2487 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2488 break;
2489 case MIMG:
2490 NewMI = mergeImagePair(CI, Paired, Where->I);
2491 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2492 break;
2493 case TBUFFER_LOAD:
2494 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2495 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2496 break;
2497 case TBUFFER_STORE:
2498 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2499 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2500 break;
2501 case FLAT_LOAD:
2502 case GLOBAL_LOAD:
2503 case GLOBAL_LOAD_SADDR:
2504 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2505 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2506 break;
2507 case FLAT_STORE:
2508 case GLOBAL_STORE:
2509 case GLOBAL_STORE_SADDR:
2510 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2511 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2512 break;
2513 }
2514 CI.setMI(NewMI, *this);
2515 CI.Order = Where->Order;
2516 if (I == Second)
2517 I = Next;
2518
2519 MergeList.erase(Second);
2520 }
2521
2522 return Modified;
2523}
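// Illustration (a sketch) of the iterative re-merging driven by
// OptimizeListAgain above: merging two dword buffer loads yields a DWORDX2
// with a combined width of 2 < 4, so the list is kept and re-processed, and
// on a later pass two adjacent DWORDX2 results can merge again into a single
// DWORDX4.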
2524
2525bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2526 if (skipFunction(MF.getFunction()))
2527 return false;
2528
2529 STM = &MF.getSubtarget<GCNSubtarget>();
2530 if (!STM->loadStoreOptEnabled())
2531 return false;
2532
2533 TII = STM->getInstrInfo();
2534 TRI = &TII->getRegisterInfo();
2535
2536 MRI = &MF.getRegInfo();
2537 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2538
2539 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2540
2541 bool Modified = false;
2542
2543 // Contains the list of instructions for which constant offsets are being
2544 // promoted to the IMM. This is tracked for an entire block at a time.
2545 SmallPtrSet<MachineInstr *, 4> AnchorList;
2546 MemInfoMap Visited;
2547
2548 for (MachineBasicBlock &MBB : MF) {
2549 MachineBasicBlock::iterator SectionEnd;
2550 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2551 I = SectionEnd) {
2552 bool CollectModified;
2553 std::list<std::list<CombineInfo>> MergeableInsts;
2554
2555 // First pass: Collect list of all instructions we know how to merge in a
2556 // subset of the block.
2557 std::tie(SectionEnd, CollectModified) =
2558 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2559
2560 Modified |= CollectModified;
2561
2562 do {
2563 OptimizeAgain = false;
2564 Modified |= optimizeBlock(MergeableInsts);
2565 } while (OptimizeAgain);
2566 }
2567
2568 Visited.clear();
2569 AnchorList.clear();
2570 }
2571
2572 return Modified;
2573}