1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
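// (Illustrative note: ds_read2/write2 encode their two offsets in units of
// the element size, here 4 bytes, so the byte offsets 16 and 32 above become
// offset0:4 and offset1:8.)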
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes the 13-bit
25// offset to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
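// (Illustrative note: the second load reuses the base built from 0x1000, and
// 0x1800 - 0x1000 = 0x800 = 2048 is folded into the immediate offset:2048.)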
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offsets, but close enough together that their differences do fit, we can
56// add to the base pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
67
68using namespace llvm;
69
70#define DEBUG_TYPE "si-load-store-opt"
71
72namespace {
73enum InstClassEnum {
74 UNKNOWN,
75 DS_READ,
76 DS_WRITE,
77 S_BUFFER_LOAD_IMM,
78 S_BUFFER_LOAD_SGPR_IMM,
79 S_LOAD_IMM,
80 BUFFER_LOAD,
81 BUFFER_STORE,
82 MIMG,
83 TBUFFER_LOAD,
84 TBUFFER_STORE,
85 GLOBAL_LOAD_SADDR,
86 GLOBAL_STORE_SADDR,
87 FLAT_LOAD,
88 FLAT_STORE,
89 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
90 GLOBAL_STORE // any CombineInfo, they are only ever returned by
91 // getCommonInstClass.
92};
93
94struct AddressRegs {
95 unsigned char NumVAddrs = 0;
96 bool SBase = false;
97 bool SRsrc = false;
98 bool SOffset = false;
99 bool SAddr = false;
100 bool VAddr = false;
101 bool Addr = false;
102 bool SSamp = false;
103};
104
105// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
106const unsigned MaxAddressRegs = 12 + 1 + 1;
107
108class SILoadStoreOptimizer {
109 struct CombineInfo {
110 MachineBasicBlock::iterator I;
111 unsigned EltSize;
112 unsigned Offset;
113 unsigned Width;
114 unsigned Format;
115 unsigned BaseOff;
116 unsigned DMask;
117 InstClassEnum InstClass;
118 unsigned CPol = 0;
119 bool IsAGPR;
120 bool UseST64;
121 int AddrIdx[MaxAddressRegs];
122 const MachineOperand *AddrReg[MaxAddressRegs];
123 unsigned NumAddresses;
124 unsigned Order;
125
126 bool hasSameBaseAddress(const CombineInfo &CI) {
127 if (NumAddresses != CI.NumAddresses)
128 return false;
129
130 const MachineInstr &MI = *CI.I;
131 for (unsigned i = 0; i < NumAddresses; i++) {
132 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
133
134 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
135 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
136 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
137 return false;
138 }
139 continue;
140 }
141
142 // Check same base pointer. Be careful of subregisters, which can occur
143 // with vectors of pointers.
144 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
145 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
146 return false;
147 }
148 }
149 return true;
150 }
151
152 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
153 for (unsigned i = 0; i < NumAddresses; ++i) {
154 const MachineOperand *AddrOp = AddrReg[i];
155 // Immediates are always OK.
156 if (AddrOp->isImm())
157 continue;
158
159 // Don't try to merge addresses that aren't either immediates or registers.
160 // TODO: Should be possible to merge FrameIndexes and maybe some other
161 // non-register operands.
162 if (!AddrOp->isReg())
163 return false;
164
165 // TODO: We should be able to merge instructions with other physical reg
166 // addresses too.
167 if (AddrOp->getReg().isPhysical() &&
168 AddrOp->getReg() != AMDGPU::SGPR_NULL)
169 return false;
170
171 // If an address has only one use then there will be no other
172 // instructions with the same address, so we can't merge this one.
173 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
174 return false;
175 }
176 return true;
177 }
178
179 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
180
181 // Compare by pointer order.
182 bool operator<(const CombineInfo& Other) const {
183 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
184 }
185 };
186
187 struct BaseRegisters {
188 Register LoReg;
189 Register HiReg;
190
191 unsigned LoSubReg = 0;
192 unsigned HiSubReg = 0;
193 };
194
195 struct MemAddress {
196 BaseRegisters Base;
197 int64_t Offset = 0;
198 };
199
200 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
201
202private:
203 const GCNSubtarget *STM = nullptr;
204 const SIInstrInfo *TII = nullptr;
205 const SIRegisterInfo *TRI = nullptr;
206 MachineRegisterInfo *MRI = nullptr;
207 AliasAnalysis *AA = nullptr;
208 bool OptimizeAgain;
209
210 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
211 const DenseSet<Register> &ARegUses,
212 const MachineInstr &A, const MachineInstr &B) const;
213 static bool dmasksCanBeCombined(const CombineInfo &CI,
214 const SIInstrInfo &TII,
215 const CombineInfo &Paired);
216 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
217 CombineInfo &Paired, bool Modify = false);
218 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
219 const CombineInfo &Paired);
220 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
221 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
222 const CombineInfo &Paired);
223 const TargetRegisterClass *
224 getTargetRegisterClass(const CombineInfo &CI,
225 const CombineInfo &Paired) const;
226 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
227
228 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
229
230 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
231 MachineBasicBlock::iterator InsertBefore, int OpName,
232 Register DestReg) const;
233 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
234 MachineBasicBlock::iterator InsertBefore,
235 int OpName) const;
236
237 unsigned read2Opcode(unsigned EltSize) const;
238 unsigned read2ST64Opcode(unsigned EltSize) const;
239 MachineBasicBlock::iterator
240 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
241 MachineBasicBlock::iterator InsertBefore);
242
243 unsigned write2Opcode(unsigned EltSize) const;
244 unsigned write2ST64Opcode(unsigned EltSize) const;
245 MachineBasicBlock::iterator
246 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
247 MachineBasicBlock::iterator InsertBefore);
248 MachineBasicBlock::iterator
249 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
250 MachineBasicBlock::iterator InsertBefore);
251 MachineBasicBlock::iterator
252 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
253 MachineBasicBlock::iterator InsertBefore);
254 MachineBasicBlock::iterator
255 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263 MachineBasicBlock::iterator
264 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
265 MachineBasicBlock::iterator InsertBefore);
266 MachineBasicBlock::iterator
267 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
268 MachineBasicBlock::iterator InsertBefore);
269 MachineBasicBlock::iterator
270 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
271 MachineBasicBlock::iterator InsertBefore);
272
273 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
274 int32_t NewOffset) const;
275 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
276 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
277 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
278 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
279 /// Promotes a constant offset to the immediate by adjusting the base. It
280 /// tries to use a base from the nearby instructions that allows it to have
281 /// a 13-bit constant offset which gets promoted to the immediate.
282 bool promoteConstantOffsetToImm(MachineInstr &CI,
283 MemInfoMap &Visited,
284 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
285 void addInstToMergeableList(const CombineInfo &CI,
286 std::list<std::list<CombineInfo> > &MergeableInsts) const;
287
288 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
289 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
290 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
291 std::list<std::list<CombineInfo>> &MergeableInsts) const;
292
293 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
294 const CombineInfo &Paired);
295
296 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
297 const CombineInfo &Paired);
298
299 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
300 bool &OptimizeListAgain);
301 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
302
303public:
304 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
305 bool run(MachineFunction &MF);
306};
307
308class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
309public:
310 static char ID;
311
312 SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
313
314 bool runOnMachineFunction(MachineFunction &MF) override;
315
316 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
317
318 void getAnalysisUsage(AnalysisUsage &AU) const override {
319 AU.setPreservesCFG();
320 AU.addRequired<AAResultsWrapperPass>();
321
322 MachineFunctionPass::getAnalysisUsage(AU);
323 }
324
325 MachineFunctionProperties getRequiredProperties() const override {
326 return MachineFunctionProperties()
327 .set(MachineFunctionProperties::Property::IsSSA);
328 }
329};
330
331static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
332 const unsigned Opc = MI.getOpcode();
333
334 if (TII.isMUBUF(Opc)) {
335 // FIXME: Handle d16 correctly
336 return AMDGPU::getMUBUFElements(Opc);
337 }
338 if (TII.isImage(MI)) {
339 uint64_t DMaskImm =
340 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
341 return llvm::popcount(DMaskImm);
342 }
343 if (TII.isMTBUF(Opc)) {
344 return AMDGPU::getMTBUFElements(Opc);
345 }
346
347 switch (Opc) {
348 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
349 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
350 case AMDGPU::S_LOAD_DWORD_IMM:
351 case AMDGPU::GLOBAL_LOAD_DWORD:
352 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
353 case AMDGPU::GLOBAL_STORE_DWORD:
354 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
355 case AMDGPU::FLAT_LOAD_DWORD:
356 case AMDGPU::FLAT_STORE_DWORD:
357 return 1;
358 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
359 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
360 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
361 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
362 case AMDGPU::S_LOAD_DWORDX2_IMM:
363 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
364 case AMDGPU::GLOBAL_LOAD_DWORDX2:
365 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
366 case AMDGPU::GLOBAL_STORE_DWORDX2:
367 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
368 case AMDGPU::FLAT_LOAD_DWORDX2:
369 case AMDGPU::FLAT_STORE_DWORDX2:
370 return 2;
371 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
372 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
373 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
374 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
375 case AMDGPU::S_LOAD_DWORDX3_IMM:
376 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
377 case AMDGPU::GLOBAL_LOAD_DWORDX3:
378 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
379 case AMDGPU::GLOBAL_STORE_DWORDX3:
380 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
381 case AMDGPU::FLAT_LOAD_DWORDX3:
382 case AMDGPU::FLAT_STORE_DWORDX3:
383 return 3;
384 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
385 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
386 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
387 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
388 case AMDGPU::S_LOAD_DWORDX4_IMM:
389 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
390 case AMDGPU::GLOBAL_LOAD_DWORDX4:
391 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
392 case AMDGPU::GLOBAL_STORE_DWORDX4:
393 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
394 case AMDGPU::FLAT_LOAD_DWORDX4:
395 case AMDGPU::FLAT_STORE_DWORDX4:
396 return 4;
397 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
398 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
399 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
400 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
401 case AMDGPU::S_LOAD_DWORDX8_IMM:
402 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
403 return 8;
404 case AMDGPU::DS_READ_B32:
405 case AMDGPU::DS_READ_B32_gfx9:
406 case AMDGPU::DS_WRITE_B32:
407 case AMDGPU::DS_WRITE_B32_gfx9:
408 return 1;
409 case AMDGPU::DS_READ_B64:
410 case AMDGPU::DS_READ_B64_gfx9:
411 case AMDGPU::DS_WRITE_B64:
412 case AMDGPU::DS_WRITE_B64_gfx9:
413 return 2;
414 default:
415 return 0;
416 }
417}
418
419/// Maps instruction opcode to enum InstClassEnum.
420static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
421 switch (Opc) {
422 default:
423 if (TII.isMUBUF(Opc)) {
424 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
425 default:
426 return UNKNOWN;
427 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
428 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
429 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
430 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
431 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
432 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
433 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
434 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
435 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
436 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
437 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
438 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
439 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
440 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
441 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
442 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
443 return BUFFER_LOAD;
444 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
445 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
446 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
447 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
448 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
449 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
450 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
451 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
452 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
453 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
454 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
455 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
456 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
457 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
458 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
459 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
460 return BUFFER_STORE;
461 }
462 }
463 if (TII.isImage(Opc)) {
464 // Ignore instructions encoded without vaddr.
465 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
466 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
467 return UNKNOWN;
468 // Ignore BVH instructions
469 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
470 return UNKNOWN;
471 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
472 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
473 TII.isGather4(Opc))
474 return UNKNOWN;
475 return MIMG;
476 }
477 if (TII.isMTBUF(Opc)) {
478 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
479 default:
480 return UNKNOWN;
481 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
482 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
483 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
484 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
485 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
486 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
487 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
488 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
489 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
490 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
491 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
492 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
493 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
494 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
495 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
496 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
497 return TBUFFER_LOAD;
498 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
499 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
500 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
501 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
502 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
503 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
504 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
505 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
506 return TBUFFER_STORE;
507 }
508 }
509 return UNKNOWN;
510 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
511 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
512 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
513 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
514 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
515 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
516 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
517 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
518 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
519 return S_BUFFER_LOAD_IMM;
520 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
521 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
522 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
523 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
524 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
525 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
526 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
527 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
528 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
529 return S_BUFFER_LOAD_SGPR_IMM;
530 case AMDGPU::S_LOAD_DWORD_IMM:
531 case AMDGPU::S_LOAD_DWORDX2_IMM:
532 case AMDGPU::S_LOAD_DWORDX3_IMM:
533 case AMDGPU::S_LOAD_DWORDX4_IMM:
534 case AMDGPU::S_LOAD_DWORDX8_IMM:
535 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
536 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
537 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
538 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
539 return S_LOAD_IMM;
540 case AMDGPU::DS_READ_B32:
541 case AMDGPU::DS_READ_B32_gfx9:
542 case AMDGPU::DS_READ_B64:
543 case AMDGPU::DS_READ_B64_gfx9:
544 return DS_READ;
545 case AMDGPU::DS_WRITE_B32:
546 case AMDGPU::DS_WRITE_B32_gfx9:
547 case AMDGPU::DS_WRITE_B64:
548 case AMDGPU::DS_WRITE_B64_gfx9:
549 return DS_WRITE;
550 case AMDGPU::GLOBAL_LOAD_DWORD:
551 case AMDGPU::GLOBAL_LOAD_DWORDX2:
552 case AMDGPU::GLOBAL_LOAD_DWORDX3:
553 case AMDGPU::GLOBAL_LOAD_DWORDX4:
554 case AMDGPU::FLAT_LOAD_DWORD:
555 case AMDGPU::FLAT_LOAD_DWORDX2:
556 case AMDGPU::FLAT_LOAD_DWORDX3:
557 case AMDGPU::FLAT_LOAD_DWORDX4:
558 return FLAT_LOAD;
559 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
560 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
561 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
562 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
563 return GLOBAL_LOAD_SADDR;
564 case AMDGPU::GLOBAL_STORE_DWORD:
565 case AMDGPU::GLOBAL_STORE_DWORDX2:
566 case AMDGPU::GLOBAL_STORE_DWORDX3:
567 case AMDGPU::GLOBAL_STORE_DWORDX4:
568 case AMDGPU::FLAT_STORE_DWORD:
569 case AMDGPU::FLAT_STORE_DWORDX2:
570 case AMDGPU::FLAT_STORE_DWORDX3:
571 case AMDGPU::FLAT_STORE_DWORDX4:
572 return FLAT_STORE;
573 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
574 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
575 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
576 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
577 return GLOBAL_STORE_SADDR;
578 }
579}
580
581/// Determines instruction subclass from opcode. Only instructions
582/// of the same subclass can be merged together. The merged instruction may have
583/// a different subclass but must have the same class.
584static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
585 switch (Opc) {
586 default:
587 if (TII.isMUBUF(Opc))
588 return AMDGPU::getMUBUFBaseOpcode(Opc);
589 if (TII.isImage(Opc)) {
590 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
591 assert(Info);
592 return Info->BaseOpcode;
593 }
594 if (TII.isMTBUF(Opc))
595 return AMDGPU::getMTBUFBaseOpcode(Opc);
596 return -1;
597 case AMDGPU::DS_READ_B32:
598 case AMDGPU::DS_READ_B32_gfx9:
599 case AMDGPU::DS_READ_B64:
600 case AMDGPU::DS_READ_B64_gfx9:
601 case AMDGPU::DS_WRITE_B32:
602 case AMDGPU::DS_WRITE_B32_gfx9:
603 case AMDGPU::DS_WRITE_B64:
604 case AMDGPU::DS_WRITE_B64_gfx9:
605 return Opc;
606 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
607 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
608 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
609 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
610 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
611 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
612 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
613 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
614 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
615 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
616 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
617 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
618 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
619 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
620 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
621 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
622 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
623 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
624 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
625 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
626 case AMDGPU::S_LOAD_DWORD_IMM:
627 case AMDGPU::S_LOAD_DWORDX2_IMM:
628 case AMDGPU::S_LOAD_DWORDX3_IMM:
629 case AMDGPU::S_LOAD_DWORDX4_IMM:
630 case AMDGPU::S_LOAD_DWORDX8_IMM:
631 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
632 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
633 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
634 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
635 return AMDGPU::S_LOAD_DWORD_IMM;
636 case AMDGPU::GLOBAL_LOAD_DWORD:
637 case AMDGPU::GLOBAL_LOAD_DWORDX2:
638 case AMDGPU::GLOBAL_LOAD_DWORDX3:
639 case AMDGPU::GLOBAL_LOAD_DWORDX4:
640 case AMDGPU::FLAT_LOAD_DWORD:
641 case AMDGPU::FLAT_LOAD_DWORDX2:
642 case AMDGPU::FLAT_LOAD_DWORDX3:
643 case AMDGPU::FLAT_LOAD_DWORDX4:
644 return AMDGPU::FLAT_LOAD_DWORD;
645 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
646 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
647 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
648 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
649 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
650 case AMDGPU::GLOBAL_STORE_DWORD:
651 case AMDGPU::GLOBAL_STORE_DWORDX2:
652 case AMDGPU::GLOBAL_STORE_DWORDX3:
653 case AMDGPU::GLOBAL_STORE_DWORDX4:
654 case AMDGPU::FLAT_STORE_DWORD:
655 case AMDGPU::FLAT_STORE_DWORDX2:
656 case AMDGPU::FLAT_STORE_DWORDX3:
657 case AMDGPU::FLAT_STORE_DWORDX4:
658 return AMDGPU::FLAT_STORE_DWORD;
659 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
660 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
661 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
662 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
663 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
664 }
665}
666
667// GLOBAL loads and stores are classified as FLAT initially. If both combined
668// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
669// If either or both instructions are non-segment-specific FLAT, the resulting
670// combined operation will be FLAT, potentially promoting one of the GLOBAL
671// operations to FLAT.
672// For other instructions, return the original unmodified class.
673InstClassEnum
674SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
675 const CombineInfo &Paired) {
676 assert(CI.InstClass == Paired.InstClass);
677
678 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
679 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
680 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
681
682 return CI.InstClass;
683}
684
685static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
686 AddressRegs Result;
687
688 if (TII.isMUBUF(Opc)) {
689 if (AMDGPU::getMUBUFHasVAddr(Opc))
690 Result.VAddr = true;
691 if (AMDGPU::getMUBUFHasSrsrc(Opc))
692 Result.SRsrc = true;
693 if (AMDGPU::getMUBUFHasSoffset(Opc))
694 Result.SOffset = true;
695
696 return Result;
697 }
698
699 if (TII.isImage(Opc)) {
700 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
701 if (VAddr0Idx >= 0) {
702 int RsrcName =
703 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
704 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
705 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
706 } else {
707 Result.VAddr = true;
708 }
709 Result.SRsrc = true;
710 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
711 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
712 Result.SSamp = true;
713
714 return Result;
715 }
716 if (TII.isMTBUF(Opc)) {
717 if (AMDGPU::getMTBUFHasVAddr(Opc))
718 Result.VAddr = true;
719 if (AMDGPU::getMTBUFHasSrsrc(Opc))
720 Result.SRsrc = true;
721 if (AMDGPU::getMTBUFHasSoffset(Opc))
722 Result.SOffset = true;
723
724 return Result;
725 }
726
727 switch (Opc) {
728 default:
729 return Result;
730 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
731 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
732 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
733 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
734 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
735 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
736 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
737 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
738 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
739 Result.SOffset = true;
740 [[fallthrough]];
741 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
742 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
743 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
744 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
745 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
746 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
747 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
748 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
749 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
750 case AMDGPU::S_LOAD_DWORD_IMM:
751 case AMDGPU::S_LOAD_DWORDX2_IMM:
752 case AMDGPU::S_LOAD_DWORDX3_IMM:
753 case AMDGPU::S_LOAD_DWORDX4_IMM:
754 case AMDGPU::S_LOAD_DWORDX8_IMM:
755 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
756 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
757 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
758 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
759 Result.SBase = true;
760 return Result;
761 case AMDGPU::DS_READ_B32:
762 case AMDGPU::DS_READ_B64:
763 case AMDGPU::DS_READ_B32_gfx9:
764 case AMDGPU::DS_READ_B64_gfx9:
765 case AMDGPU::DS_WRITE_B32:
766 case AMDGPU::DS_WRITE_B64:
767 case AMDGPU::DS_WRITE_B32_gfx9:
768 case AMDGPU::DS_WRITE_B64_gfx9:
769 Result.Addr = true;
770 return Result;
771 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
772 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
773 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
774 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
775 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
776 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
777 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
778 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
779 Result.SAddr = true;
780 [[fallthrough]];
781 case AMDGPU::GLOBAL_LOAD_DWORD:
782 case AMDGPU::GLOBAL_LOAD_DWORDX2:
783 case AMDGPU::GLOBAL_LOAD_DWORDX3:
784 case AMDGPU::GLOBAL_LOAD_DWORDX4:
785 case AMDGPU::GLOBAL_STORE_DWORD:
786 case AMDGPU::GLOBAL_STORE_DWORDX2:
787 case AMDGPU::GLOBAL_STORE_DWORDX3:
788 case AMDGPU::GLOBAL_STORE_DWORDX4:
789 case AMDGPU::FLAT_LOAD_DWORD:
790 case AMDGPU::FLAT_LOAD_DWORDX2:
791 case AMDGPU::FLAT_LOAD_DWORDX3:
792 case AMDGPU::FLAT_LOAD_DWORDX4:
793 case AMDGPU::FLAT_STORE_DWORD:
794 case AMDGPU::FLAT_STORE_DWORDX2:
795 case AMDGPU::FLAT_STORE_DWORDX3:
796 case AMDGPU::FLAT_STORE_DWORDX4:
797 Result.VAddr = true;
798 return Result;
799 }
800}
801
802void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
803 const SILoadStoreOptimizer &LSO) {
804 I = MI;
805 unsigned Opc = MI->getOpcode();
806 InstClass = getInstClass(Opc, *LSO.TII);
807
808 if (InstClass == UNKNOWN)
809 return;
810
811 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
812
813 switch (InstClass) {
814 case DS_READ:
815 EltSize =
816 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
817 : 4;
818 break;
819 case DS_WRITE:
820 EltSize =
821 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
822 : 4;
823 break;
824 case S_BUFFER_LOAD_IMM:
825 case S_BUFFER_LOAD_SGPR_IMM:
826 case S_LOAD_IMM:
827 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
828 break;
829 default:
830 EltSize = 4;
831 break;
832 }
833
834 if (InstClass == MIMG) {
835 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
836 // Offset is not considered for MIMG instructions.
837 Offset = 0;
838 } else {
839 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
840 Offset = I->getOperand(OffsetIdx).getImm();
841 }
842
843 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
844 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
845
846 Width = getOpcodeWidth(*I, *LSO.TII);
847
848 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
849 Offset &= 0xffff;
850 } else if (InstClass != MIMG) {
851 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
852 }
853
854 AddressRegs Regs = getRegs(Opc, *LSO.TII);
855 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
856
857 NumAddresses = 0;
858 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
859 AddrIdx[NumAddresses++] =
860 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
861 if (Regs.Addr)
862 AddrIdx[NumAddresses++] =
863 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
864 if (Regs.SBase)
865 AddrIdx[NumAddresses++] =
866 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
867 if (Regs.SRsrc)
868 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
869 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
870 if (Regs.SOffset)
871 AddrIdx[NumAddresses++] =
872 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
873 if (Regs.SAddr)
874 AddrIdx[NumAddresses++] =
875 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
876 if (Regs.VAddr)
877 AddrIdx[NumAddresses++] =
878 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
879 if (Regs.SSamp)
880 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
881 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
882 assert(NumAddresses <= MaxAddressRegs);
883
884 for (unsigned J = 0; J < NumAddresses; J++)
885 AddrReg[J] = &I->getOperand(AddrIdx[J]);
886}
887
888} // end anonymous namespace.
889
890INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
891 "SI Load Store Optimizer", false, false)
892INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
893INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
894 "SI Load Store Optimizer", false, false)
895
896char SILoadStoreOptimizerLegacy::ID = 0;
897
898char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
899
900FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
901 return new SILoadStoreOptimizerLegacy();
902}
903
904static void addDefsUsesToList(const MachineInstr &MI,
905 DenseSet<Register> &RegDefs,
906 DenseSet<Register> &RegUses) {
907 for (const auto &Op : MI.operands()) {
908 if (!Op.isReg())
909 continue;
910 if (Op.isDef())
911 RegDefs.insert(Op.getReg());
912 if (Op.readsReg())
913 RegUses.insert(Op.getReg());
914 }
915}
916
917bool SILoadStoreOptimizer::canSwapInstructions(
918 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
919 const MachineInstr &A, const MachineInstr &B) const {
920 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
921 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
922 return false;
923 for (const auto &BOp : B.operands()) {
924 if (!BOp.isReg())
925 continue;
926 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
927 return false;
928 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
929 return false;
930 }
931 return true;
932}
933
934// Given that \p CI and \p Paired are adjacent memory operations, produce a new
935// MMO for the combined operation with a new access size.
936MachineMemOperand *
937SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
938 const CombineInfo &Paired) {
939 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
940 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
941
942 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
943
944 // A base pointer for the combined operation is the same as the leading
945 // operation's pointer.
946 if (Paired < CI)
947 std::swap(MMOa, MMOb);
948
949 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
950 // If merging FLAT and GLOBAL set address space to FLAT.
951 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
952 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
953
954 MachineFunction *MF = CI.I->getMF();
955 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
956}
957
958bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
959 const SIInstrInfo &TII,
960 const CombineInfo &Paired) {
961 assert(CI.InstClass == MIMG);
962
963 // Ignore instructions with tfe/lwe set.
964 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
965 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
966
967 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
968 return false;
969
970 // Check other optional immediate operands for equality.
971 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
972 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
973 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
974
975 for (auto op : OperandsToMatch) {
976 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
977 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
978 return false;
979 if (Idx != -1 &&
980 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
981 return false;
982 }
983
984 // Check DMask for overlaps.
985 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
986 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
987
988 if (!MaxMask)
989 return false;
990
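// Illustrative example (not from the original source): DMasks 0b0011 and
// 0b1100 can be combined, since countr_zero(0b1100) == 2 and 0b0011 < (1u << 2);
// DMasks 0b0101 and 0b1100 cannot, since 0b0101 >= (1u << 2).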
991 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
992 if ((1u << AllowedBitsForMin) <= MinMask)
993 return false;
994
995 return true;
996}
997
998static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
999 unsigned ComponentCount,
1000 const GCNSubtarget &STI) {
1001 if (ComponentCount > 4)
1002 return 0;
1003
1004 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1005 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1006 if (!OldFormatInfo)
1007 return 0;
1008
1009 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1010 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1011 ComponentCount,
1012 OldFormatInfo->NumFormat, STI);
1013
1014 if (!NewFormatInfo)
1015 return 0;
1016
1017 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1018 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1019
1020 return NewFormatInfo->Format;
1021}
1022
1023// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1024// highest power of two. Note that the result is well defined for all inputs
1025// including corner cases like:
1026// - if Lo == Hi, return that value
1027// - if Lo == 0, return 0 (even though the "- 1" below underflows)
1028// - if Lo > Hi, return 0 (as if the range wrapped around)
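// Worked examples (illustrative): mostAlignedValueInRange(5, 12) == 8,
// mostAlignedValueInRange(3, 7) == 4, and mostAlignedValueInRange(6, 6) == 6.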
1029static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1030 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1031}
1032
1033bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1034 const GCNSubtarget &STI,
1035 CombineInfo &Paired,
1036 bool Modify) {
1037 assert(CI.InstClass != MIMG);
1038
1039 // XXX - Would the same offset be OK? Is there any reason this would happen or
1040 // be useful?
1041 if (CI.Offset == Paired.Offset)
1042 return false;
1043
1044 // This won't be valid if the offset isn't aligned.
1045 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1046 return false;
1047
1048 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1049
1050 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1051 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1052 if (!Info0)
1053 return false;
1054 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1055 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1056 if (!Info1)
1057 return false;
1058
1059 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1060 Info0->NumFormat != Info1->NumFormat)
1061 return false;
1062
1063 // TODO: Should be possible to support more formats, but if format loads
1064 // are not dword-aligned, the merged load might not be valid.
1065 if (Info0->BitsPerComp != 32)
1066 return false;
1067
1068 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1069 return false;
1070 }
1071
1072 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1073 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1074 CI.UseST64 = false;
1075 CI.BaseOff = 0;
1076
1077 // Handle all non-DS instructions.
1078 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1079 if (EltOffset0 + CI.Width != EltOffset1 &&
1080 EltOffset1 + Paired.Width != EltOffset0)
1081 return false;
1082 if (CI.CPol != Paired.CPol)
1083 return false;
1084 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1085 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1086 // Reject cases like:
1087 // dword + dwordx2 -> dwordx3
1088 // dword + dwordx3 -> dwordx4
1089 // If we tried to combine these cases, we would fail to extract a subreg
1090 // for the result of the second load due to SGPR alignment requirements.
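// (Illustrative: a dword at the lower offset merged with a dwordx2 at the
// higher offset would need the unaligned sub1_sub2 pair of the dwordx3
// result, so that order is rejected; the opposite order is fine.)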
1091 if (CI.Width != Paired.Width &&
1092 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1093 return false;
1094 }
1095 return true;
1096 }
1097
1098 // If the offset in elements doesn't fit in 8 bits, we might be able to use
1099 // the stride 64 versions.
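// E.g. (illustrative): with EltSize == 4, byte offsets 0x4000 and 0x8000 give
// element offsets 4096 and 8192; both are multiples of 64, and 4096/64 == 64
// and 8192/64 == 128 fit in 8 bits, so the ST64 forms can be used.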
1100 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1101 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1102 if (Modify) {
1103 CI.Offset = EltOffset0 / 64;
1104 Paired.Offset = EltOffset1 / 64;
1105 CI.UseST64 = true;
1106 }
1107 return true;
1108 }
1109
1110 // Check if the new offsets fit in the reduced 8-bit range.
1111 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1112 if (Modify) {
1113 CI.Offset = EltOffset0;
1114 Paired.Offset = EltOffset1;
1115 }
1116 return true;
1117 }
1118
1119 // Try to shift base address to decrease offsets.
1120 uint32_t Min = std::min(EltOffset0, EltOffset1);
1121 uint32_t Max = std::max(EltOffset0, EltOffset1);
1122
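// (Note: maskTrailingOnes<uint32_t>(8) * 64 == 0xff * 64 == 0x3fc0, i.e. the
// deltas that are a multiple of 64 and, divided by 64, still fit in 8 bits.)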
1123 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1124 if (((Max - Min) & ~Mask) == 0) {
1125 if (Modify) {
1126 // From the range of values we could use for BaseOff, choose the one that
1127 // is aligned to the highest power of two, to maximise the chance that
1128 // the same offset can be reused for other load/store pairs.
1129 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1130 // Copy the low bits of the offsets, so that when we adjust them by
1131 // subtracting BaseOff they will be multiples of 64.
1132 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1133 CI.BaseOff = BaseOff * CI.EltSize;
1134 CI.Offset = (EltOffset0 - BaseOff) / 64;
1135 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1136 CI.UseST64 = true;
1137 }
1138 return true;
1139 }
1140
1141 if (isUInt<8>(Max - Min)) {
1142 if (Modify) {
1143 // From the range of values we could use for BaseOff, choose the one that
1144 // is aligned to the highest power of two, to maximise the chance that
1145 // the same offset can be reused for other load/store pairs.
1146 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1147 CI.BaseOff = BaseOff * CI.EltSize;
1148 CI.Offset = EltOffset0 - BaseOff;
1149 Paired.Offset = EltOffset1 - BaseOff;
1150 }
1151 return true;
1152 }
1153
1154 return false;
1155}
1156
1157bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1158 const CombineInfo &CI,
1159 const CombineInfo &Paired) {
1160 const unsigned Width = (CI.Width + Paired.Width);
1161 switch (CI.InstClass) {
1162 default:
1163 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1164 case S_BUFFER_LOAD_IMM:
1165 case S_BUFFER_LOAD_SGPR_IMM:
1166 case S_LOAD_IMM:
1167 switch (Width) {
1168 default:
1169 return false;
1170 case 2:
1171 case 4:
1172 case 8:
1173 return true;
1174 case 3:
1175 return STM.hasScalarDwordx3Loads();
1176 }
1177 }
1178}
1179
1180const TargetRegisterClass *
1181SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1182 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1183 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1184 }
1185 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1186 return TRI->getRegClassForReg(*MRI, Src->getReg());
1187 }
1188 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1189 return TRI->getRegClassForReg(*MRI, Src->getReg());
1190 }
1191 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1192 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1193 }
1194 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1195 return TRI->getRegClassForReg(*MRI, Src->getReg());
1196 }
1197 return nullptr;
1198}
1199
1200/// This function assumes that CI comes before Paired in a basic block. Return
1201/// an insertion point for the merged instruction or nullptr on failure.
1202SILoadStoreOptimizer::CombineInfo *
1203SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1204 CombineInfo &Paired) {
1205 // If another instruction has already been merged into CI, it may now be a
1206 // type that we can't do any further merging into.
1207 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1208 return nullptr;
1209 assert(CI.InstClass == Paired.InstClass);
1210
1211 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1212 getInstSubclass(Paired.I->getOpcode(), *TII))
1213 return nullptr;
1214
1215 // Check both offsets (or masks for MIMG) can be combined and fit in the
1216 // reduced range.
1217 if (CI.InstClass == MIMG) {
1218 if (!dmasksCanBeCombined(CI, *TII, Paired))
1219 return nullptr;
1220 } else {
1221 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1222 return nullptr;
1223 }
1224
1225 DenseSet<Register> RegDefs;
1226 DenseSet<Register> RegUses;
1227 CombineInfo *Where;
1228 if (CI.I->mayLoad()) {
1229 // Try to hoist Paired up to CI.
1230 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1231 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1232 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1233 return nullptr;
1234 }
1235 Where = &CI;
1236 } else {
1237 // Try to sink CI down to Paired.
1238 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1239 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1240 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1241 return nullptr;
1242 }
1243 Where = &Paired;
1244 }
1245
1246 // Call offsetsCanBeCombined with modify = true so that the offsets are
1247 // correct for the new instruction. This should return true, because
1248 // this function should only be called on CombineInfo objects that
1249 // have already been confirmed to be mergeable.
1250 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1251 offsetsCanBeCombined(CI, *STM, Paired, true);
1252 return Where;
1253}
1254
1255// Copy the merged load result from DestReg to the original dest regs of CI and
1256// Paired.
1257void SILoadStoreOptimizer::copyToDestRegs(
1258 CombineInfo &CI, CombineInfo &Paired,
1259 MachineBasicBlock::iterator InsertBefore, int OpName,
1260 Register DestReg) const {
1261 MachineBasicBlock *MBB = CI.I->getParent();
1262 DebugLoc DL = CI.I->getDebugLoc();
1263
1264 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1265
1266 // Copy to the old destination registers.
1267 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1268 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1269 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1270
1271 // The constrained sload instructions in the S_LOAD_IMM class will have the
1272 // `early-clobber` flag in the dst operand. Remove the flag before using the
1273 // MOs in copies.
1274 Dest0->setIsEarlyClobber(false);
1275 Dest1->setIsEarlyClobber(false);
1276
1277 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1278 .add(*Dest0) // Copy to same destination including flags and sub reg.
1279 .addReg(DestReg, 0, SubRegIdx0);
1280 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1281 .add(*Dest1)
1282 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1283}
1284
1285// Return a register for the source of the merged store after copying the
1286// original source regs of CI and Paired into it.
1287Register
1288SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1289 MachineBasicBlock::iterator InsertBefore,
1290 int OpName) const {
1291 MachineBasicBlock *MBB = CI.I->getParent();
1292 DebugLoc DL = CI.I->getDebugLoc();
1293
1294 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1295
1296 // Copy to the new source register.
1297 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1298 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1299
1300 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1301 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1302
1303 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1304 .add(*Src0)
1305 .addImm(SubRegIdx0)
1306 .add(*Src1)
1307 .addImm(SubRegIdx1);
1308
1309 return SrcReg;
1310}
1311
1312unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1313 if (STM->ldsRequiresM0Init())
1314 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1315 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1316}
1317
1318unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1319 if (STM->ldsRequiresM0Init())
1320 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1321
1322 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1323 : AMDGPU::DS_READ2ST64_B64_gfx9;
1324}
1325
1326MachineBasicBlock::iterator
1327SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1328 MachineBasicBlock::iterator InsertBefore) {
1329 MachineBasicBlock *MBB = CI.I->getParent();
1330
1331 // Be careful, since the addresses could be subregisters themselves in weird
1332 // cases, like vectors of pointers.
1333 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1334
1335 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1336 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1337 unsigned Opc =
1338 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1339
1340 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1341 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1342
1343 const MCInstrDesc &Read2Desc = TII->get(Opc);
1344
1345 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1346 Register DestReg = MRI->createVirtualRegister(SuperRC);
1347
1348 DebugLoc DL = CI.I->getDebugLoc();
1349
1350 Register BaseReg = AddrReg->getReg();
1351 unsigned BaseSubReg = AddrReg->getSubReg();
1352 unsigned BaseRegFlags = 0;
1353 if (CI.BaseOff) {
1354 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1355 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1356 .addImm(CI.BaseOff);
1357
1358 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1359 BaseRegFlags = RegState::Kill;
1360
1361 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1362 .addReg(ImmReg)
1363 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1364 .addImm(0); // clamp bit
1365 BaseSubReg = 0;
1366 }
1367
1368 MachineInstrBuilder Read2 =
1369 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1370 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1371 .addImm(NewOffset0) // offset0
1372 .addImm(NewOffset1) // offset1
1373 .addImm(0) // gds
1374 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1375
1376 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1377
1378 CI.I->eraseFromParent();
1379 Paired.I->eraseFromParent();
1380
1381 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1382 return Read2;
1383}
1384
1385unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1386 if (STM->ldsRequiresM0Init())
1387 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1388 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1389 : AMDGPU::DS_WRITE2_B64_gfx9;
1390}
1391
1392unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1393 if (STM->ldsRequiresM0Init())
1394 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1395 : AMDGPU::DS_WRITE2ST64_B64;
1396
1397 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1398 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1399}
1400
1401MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1402 CombineInfo &CI, CombineInfo &Paired,
1403 MachineBasicBlock::iterator InsertBefore) {
1404 MachineBasicBlock *MBB = CI.I->getParent();
1405
1406 // Be sure to use .add() with these operands, and not .addReg(). We want to be
1407 // sure we preserve the subregister index and any register flags set on them.
1408 const MachineOperand *AddrReg =
1409 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1410 const MachineOperand *Data0 =
1411 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1412 const MachineOperand *Data1 =
1413 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1414
1415 unsigned NewOffset0 = CI.Offset;
1416 unsigned NewOffset1 = Paired.Offset;
1417 unsigned Opc =
1418 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1419
1420 if (NewOffset0 > NewOffset1) {
1421 // Canonicalize the merged instruction so the smaller offset comes first.
1422 std::swap(NewOffset0, NewOffset1);
1423 std::swap(Data0, Data1);
1424 }
1425
1426 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1427 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1428
1429 const MCInstrDesc &Write2Desc = TII->get(Opc);
1430 DebugLoc DL = CI.I->getDebugLoc();
1431
1432 Register BaseReg = AddrReg->getReg();
1433 unsigned BaseSubReg = AddrReg->getSubReg();
1434 unsigned BaseRegFlags = 0;
1435 if (CI.BaseOff) {
1436 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1437 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1438 .addImm(CI.BaseOff);
1439
1440 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1441 BaseRegFlags = RegState::Kill;
1442
1443 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1444 .addReg(ImmReg)
1445 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1446 .addImm(0); // clamp bit
1447 BaseSubReg = 0;
1448 }
1449
1450 MachineInstrBuilder Write2 =
1451 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1452 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1453 .add(*Data0) // data0
1454 .add(*Data1) // data1
1455 .addImm(NewOffset0) // offset0
1456 .addImm(NewOffset1) // offset1
1457 .addImm(0) // gds
1458 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1459
1460 CI.I->eraseFromParent();
1461 Paired.I->eraseFromParent();
1462
1463 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1464 return Write2;
1465}
1466
1467MachineBasicBlock::iterator
1468SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1469 MachineBasicBlock::iterator InsertBefore) {
1470 MachineBasicBlock *MBB = CI.I->getParent();
1471 DebugLoc DL = CI.I->getDebugLoc();
1472 const unsigned Opcode = getNewOpcode(CI, Paired);
1473
1474 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1475
1476 Register DestReg = MRI->createVirtualRegister(SuperRC);
1477 unsigned MergedDMask = CI.DMask | Paired.DMask;
1478 unsigned DMaskIdx =
1479 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1480
1481 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1482 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1483 if (I == DMaskIdx)
1484 MIB.addImm(MergedDMask);
1485 else
1486 MIB.add((*CI.I).getOperand(I));
1487 }
1488
1489 // It shouldn't be possible to get this far if the two instructions
1490 // don't have a single memoperand, because MachineInstr::mayAlias()
1491 // will return true if this is the case.
1492 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1493
1494 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1495
1496 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1497
1498 CI.I->eraseFromParent();
1499 Paired.I->eraseFromParent();
1500 return New;
1501}
1502
1503MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1504 CombineInfo &CI, CombineInfo &Paired,
1505 MachineBasicBlock::iterator InsertBefore) {
1506 MachineBasicBlock *MBB = CI.I->getParent();
1507 DebugLoc DL = CI.I->getDebugLoc();
1508 const unsigned Opcode = getNewOpcode(CI, Paired);
1509
1510 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1511
1512 Register DestReg = MRI->createVirtualRegister(SuperRC);
1513 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1514
1515 // It shouldn't be possible to get this far if the two instructions
1516 // don't have a single memoperand, because MachineInstr::mayAlias()
1517 // will return true if this is the case.
1518 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1519
1520 MachineInstrBuilder New =
1521 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1522 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1523 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1524 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1525 New.addImm(MergedOffset);
1526 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1527
1528 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1529
1530 CI.I->eraseFromParent();
1531 Paired.I->eraseFromParent();
1532 return New;
1533}
1534
1535MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1536 CombineInfo &CI, CombineInfo &Paired,
1537 MachineBasicBlock::iterator InsertBefore) {
1538 MachineBasicBlock *MBB = CI.I->getParent();
1539 DebugLoc DL = CI.I->getDebugLoc();
1540
1541 const unsigned Opcode = getNewOpcode(CI, Paired);
1542
1543 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1544
1545 // Create the new destination register.
1546 Register DestReg = MRI->createVirtualRegister(SuperRC);
1547 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1548
1549 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1550
1551 AddressRegs Regs = getRegs(Opcode, *TII);
1552
1553 if (Regs.VAddr)
1554 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1555
1556 // It shouldn't be possible to get this far if the two instructions
1557 // don't have a single memoperand, because MachineInstr::mayAlias()
1558 // will return true if this is the case.
1559 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1560
1561 MachineInstr *New =
1562 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1563 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1564 .addImm(MergedOffset) // offset
1565 .addImm(CI.CPol) // cpol
1566 .addImm(0) // swz
1567 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1568
1569 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1570
1571 CI.I->eraseFromParent();
1572 Paired.I->eraseFromParent();
1573 return New;
1574}
1575
1576MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1577 CombineInfo &CI, CombineInfo &Paired,
1578 MachineBasicBlock::iterator InsertBefore) {
1579 MachineBasicBlock *MBB = CI.I->getParent();
1580 DebugLoc DL = CI.I->getDebugLoc();
1581
1582 const unsigned Opcode = getNewOpcode(CI, Paired);
1583
1584 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1585
1586 // Create the new destination register.
1587 Register DestReg = MRI->createVirtualRegister(SuperRC);
1588 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1589
1590 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1591
1592 AddressRegs Regs = getRegs(Opcode, *TII);
1593
1594 if (Regs.VAddr)
1595 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1596
1597 unsigned JoinedFormat =
1598 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1599
1600 // It shouldn't be possible to get this far if the two instructions
1601 // don't have a single memoperand, because MachineInstr::mayAlias()
1602 // will return true if this is the case.
1603 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1604
1605 MachineInstr *New =
1606 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1607 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1608 .addImm(MergedOffset) // offset
1609 .addImm(JoinedFormat) // format
1610 .addImm(CI.CPol) // cpol
1611 .addImm(0) // swz
1612 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1613
1614 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1615
1616 CI.I->eraseFromParent();
1617 Paired.I->eraseFromParent();
1618 return New;
1619}
1620
1621 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1622 CombineInfo &CI, CombineInfo &Paired,
1623 MachineBasicBlock::iterator InsertBefore) {
1624 MachineBasicBlock *MBB = CI.I->getParent();
1625 DebugLoc DL = CI.I->getDebugLoc();
1626
1627 const unsigned Opcode = getNewOpcode(CI, Paired);
1628
1629 Register SrcReg =
1630 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1631
1632 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1633 .addReg(SrcReg, RegState::Kill);
1634
1635 AddressRegs Regs = getRegs(Opcode, *TII);
1636
1637 if (Regs.VAddr)
1638 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1639
1640 unsigned JoinedFormat =
1641 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1642
1643 // It shouldn't be possible to get this far if the two instructions
1644 // don't have a single memoperand, because MachineInstr::mayAlias()
1645 // will return true if this is the case.
1646 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1647
1648 MachineInstr *New =
1649 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1650 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1651 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1652 .addImm(JoinedFormat) // format
1653 .addImm(CI.CPol) // cpol
1654 .addImm(0) // swz
1655 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1656
1657 CI.I->eraseFromParent();
1658 Paired.I->eraseFromParent();
1659 return New;
1660}
1661
1662 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1663 CombineInfo &CI, CombineInfo &Paired,
1664 MachineBasicBlock::iterator InsertBefore) {
1665 MachineBasicBlock *MBB = CI.I->getParent();
1666 DebugLoc DL = CI.I->getDebugLoc();
1667
1668 const unsigned Opcode = getNewOpcode(CI, Paired);
1669
1670 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1671 Register DestReg = MRI->createVirtualRegister(SuperRC);
1672
1673 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1674
1675 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1676 MIB.add(*SAddr);
1677
1678 MachineInstr *New =
1679 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1680 .addImm(std::min(CI.Offset, Paired.Offset))
1681 .addImm(CI.CPol)
1682 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1683
1684 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1685
1686 CI.I->eraseFromParent();
1687 Paired.I->eraseFromParent();
1688 return New;
1689}
1690
1691 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1692 CombineInfo &CI, CombineInfo &Paired,
1693 MachineBasicBlock::iterator InsertBefore) {
1694 MachineBasicBlock *MBB = CI.I->getParent();
1695 DebugLoc DL = CI.I->getDebugLoc();
1696
1697 const unsigned Opcode = getNewOpcode(CI, Paired);
1698
1699 Register SrcReg =
1700 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1701
1702 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1703 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1704 .addReg(SrcReg, RegState::Kill);
1705
1706 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1707 MIB.add(*SAddr);
1708
1709 MachineInstr *New =
1710 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1711 .addImm(CI.CPol)
1712 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1713
1714 CI.I->eraseFromParent();
1715 Paired.I->eraseFromParent();
1716 return New;
1717}
1718
1719 static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1720 ArrayRef<MachineMemOperand *> MMOs,
1721 unsigned Width) {
1722 // Conservatively returns true if the MMO is not found.
1723 return STM.isXNACKEnabled() &&
1724 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1725}
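// For example: merging two single-dword loads gives Width == 2, an 8-byte
// (Width * 4) access. With XNACK enabled, if the first load's MMO records an
// alignment below 8 bytes (or the MMO count is not exactly one), this returns
// true and getNewOpcode() below picks the constrained "_ec" opcode variant.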
1726
1727 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1728 const CombineInfo &Paired) {
1729 const unsigned Width = CI.Width + Paired.Width;
1730
1731 switch (getCommonInstClass(CI, Paired)) {
1732 default:
1733 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1734 // FIXME: Handle d16 correctly
1735 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1736 Width);
1737 case TBUFFER_LOAD:
1738 case TBUFFER_STORE:
1739 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1740 Width);
1741
1742 case UNKNOWN:
1743 llvm_unreachable("Unknown instruction class");
1744 case S_BUFFER_LOAD_IMM: {
1745 // If XNACK is enabled, use the constrained opcodes when the first load is
1746 // under-aligned.
1747 bool NeedsConstrainedOpc =
1748 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1749 switch (Width) {
1750 default:
1751 return 0;
1752 case 2:
1753 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1754 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1755 case 3:
1756 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1757 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1758 case 4:
1759 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1760 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1761 case 8:
1762 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1763 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1764 }
1765 }
1766 case S_BUFFER_LOAD_SGPR_IMM: {
1767 // If XNACK is enabled, use the constrained opcodes when the first load is
1768 // under-aligned.
1769 bool NeedsConstrainedOpc =
1770 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1771 switch (Width) {
1772 default:
1773 return 0;
1774 case 2:
1775 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1776 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1777 case 3:
1778 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1779 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1780 case 4:
1781 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1782 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1783 case 8:
1784 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1785 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1786 }
1787 }
1788 case S_LOAD_IMM: {
1789 // If XNACK is enabled, use the constrained opcodes when the first load is
1790 // under-aligned.
1791 bool NeedsConstrainedOpc =
1792 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1793 switch (Width) {
1794 default:
1795 return 0;
1796 case 2:
1797 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1798 : AMDGPU::S_LOAD_DWORDX2_IMM;
1799 case 3:
1800 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1801 : AMDGPU::S_LOAD_DWORDX3_IMM;
1802 case 4:
1803 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1804 : AMDGPU::S_LOAD_DWORDX4_IMM;
1805 case 8:
1806 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1807 : AMDGPU::S_LOAD_DWORDX8_IMM;
1808 }
1809 }
1810 case GLOBAL_LOAD:
1811 switch (Width) {
1812 default:
1813 return 0;
1814 case 2:
1815 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1816 case 3:
1817 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1818 case 4:
1819 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1820 }
1821 case GLOBAL_LOAD_SADDR:
1822 switch (Width) {
1823 default:
1824 return 0;
1825 case 2:
1826 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1827 case 3:
1828 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1829 case 4:
1830 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1831 }
1832 case GLOBAL_STORE:
1833 switch (Width) {
1834 default:
1835 return 0;
1836 case 2:
1837 return AMDGPU::GLOBAL_STORE_DWORDX2;
1838 case 3:
1839 return AMDGPU::GLOBAL_STORE_DWORDX3;
1840 case 4:
1841 return AMDGPU::GLOBAL_STORE_DWORDX4;
1842 }
1843 case GLOBAL_STORE_SADDR:
1844 switch (Width) {
1845 default:
1846 return 0;
1847 case 2:
1848 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1849 case 3:
1850 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1851 case 4:
1852 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1853 }
1854 case FLAT_LOAD:
1855 switch (Width) {
1856 default:
1857 return 0;
1858 case 2:
1859 return AMDGPU::FLAT_LOAD_DWORDX2;
1860 case 3:
1861 return AMDGPU::FLAT_LOAD_DWORDX3;
1862 case 4:
1863 return AMDGPU::FLAT_LOAD_DWORDX4;
1864 }
1865 case FLAT_STORE:
1866 switch (Width) {
1867 default:
1868 return 0;
1869 case 2:
1870 return AMDGPU::FLAT_STORE_DWORDX2;
1871 case 3:
1872 return AMDGPU::FLAT_STORE_DWORDX3;
1873 case 4:
1874 return AMDGPU::FLAT_STORE_DWORDX4;
1875 }
1876 case MIMG:
1877 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1878 "No overlaps");
1879 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1880 }
1881}
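// For example, two adjacent s_load_dword instructions (Width 1 + 1 == 2) are
// merged into S_LOAD_DWORDX2_IMM, or into S_LOAD_DWORDX2_IMM_ec when
// needsConstrainedOpcode() reports the first load as under-aligned on an
// XNACK-enabled subtarget.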
1882
1883 std::pair<unsigned, unsigned>
1884 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1885 const CombineInfo &Paired) {
1886 assert((CI.InstClass != MIMG ||
1887 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1888 CI.Width + Paired.Width)) &&
1889 "No overlaps");
1890
1891 unsigned Idx0;
1892 unsigned Idx1;
1893
1894 static const unsigned Idxs[5][4] = {
1895 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1896 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1897 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1898 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1899 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1900 };
1901
1902 assert(CI.Width >= 1 && CI.Width <= 4);
1903 assert(Paired.Width >= 1 && Paired.Width <= 4);
1904
1905 if (Paired < CI) {
1906 Idx1 = Idxs[0][Paired.Width - 1];
1907 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1908 } else {
1909 Idx0 = Idxs[0][CI.Width - 1];
1910 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1911 }
1912
1913 return {Idx0, Idx1};
1914}
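// For example, in the else branch above (Paired is not ordered before CI),
// merging a 2-dword CI with a 2-dword Paired yields
// Idx0 = Idxs[0][1] = sub0_sub1 and Idx1 = Idxs[2][1] = sub2_sub3, i.e. CI's
// value occupies the low two dwords of the merged register and Paired's value
// the next two.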
1915
1916 const TargetRegisterClass *
1917 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1918 const CombineInfo &Paired) const {
1919 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1920 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1921 switch (CI.Width + Paired.Width) {
1922 default:
1923 return nullptr;
1924 case 2:
1925 return &AMDGPU::SReg_64_XEXECRegClass;
1926 case 3:
1927 return &AMDGPU::SGPR_96RegClass;
1928 case 4:
1929 return &AMDGPU::SGPR_128RegClass;
1930 case 8:
1931 return &AMDGPU::SGPR_256RegClass;
1932 case 16:
1933 return &AMDGPU::SGPR_512RegClass;
1934 }
1935 }
1936
1937 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1938 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1939 ? TRI->getAGPRClassForBitWidth(BitWidth)
1940 : TRI->getVGPRClassForBitWidth(BitWidth);
1941}
1942
1943 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1944 CombineInfo &CI, CombineInfo &Paired,
1945 MachineBasicBlock::iterator InsertBefore) {
1946 MachineBasicBlock *MBB = CI.I->getParent();
1947 DebugLoc DL = CI.I->getDebugLoc();
1948
1949 const unsigned Opcode = getNewOpcode(CI, Paired);
1950
1951 Register SrcReg =
1952 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1953
1954 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1955 .addReg(SrcReg, RegState::Kill);
1956
1957 AddressRegs Regs = getRegs(Opcode, *TII);
1958
1959 if (Regs.VAddr)
1960 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1961
1962
1963 // It shouldn't be possible to get this far if the two instructions
1964 // don't have a single memoperand, because MachineInstr::mayAlias()
1965 // will return true if this is the case.
1966 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1967
1968 MachineInstr *New =
1969 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1970 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1971 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1972 .addImm(CI.CPol) // cpol
1973 .addImm(0) // swz
1974 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1975
1976 CI.I->eraseFromParent();
1977 Paired.I->eraseFromParent();
1978 return New;
1979}
1980
1981 MachineOperand
1982 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1983 APInt V(32, Val, true);
1984 if (TII->isInlineConstant(V))
1985 return MachineOperand::CreateImm(Val);
1986
1987 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1988 MachineInstr *Mov =
1989 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1990 TII->get(AMDGPU::S_MOV_B32), Reg)
1991 .addImm(Val);
1992 (void)Mov;
1993 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1994 return MachineOperand::CreateReg(Reg, false);
1995}
1996
1997 // Compute base address using Addr and return the final register.
1998 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1999 const MemAddress &Addr) const {
2000 MachineBasicBlock *MBB = MI.getParent();
2001 MachineBasicBlock::iterator MBBI = MI.getIterator();
2002 DebugLoc DL = MI.getDebugLoc();
2003
2004 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2005 Addr.Base.LoSubReg) &&
2006 "Expected 32-bit Base-Register-Low!!");
2007
2008 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2009 Addr.Base.HiSubReg) &&
2010 "Expected 32-bit Base-Register-Hi!!");
2011
2012 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
2013 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2014 MachineOperand OffsetHi =
2015 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2016
2017 const auto *CarryRC = TRI->getWaveMaskRegClass();
2018 Register CarryReg = MRI->createVirtualRegister(CarryRC);
2019 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2020
2021 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2022 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2023 MachineInstr *LoHalf =
2024 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2025 .addReg(CarryReg, RegState::Define)
2026 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2027 .add(OffsetLo)
2028 .addImm(0); // clamp bit
2029 (void)LoHalf;
2030 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2031
2032 MachineInstr *HiHalf =
2033 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2034 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2035 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2036 .add(OffsetHi)
2037 .addReg(CarryReg, RegState::Kill)
2038 .addImm(0); // clamp bit
2039 (void)HiHalf;
2040 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2041
2042 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2043 MachineInstr *FullBase =
2044 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2045 .addReg(DestSub0)
2046 .addImm(AMDGPU::sub0)
2047 .addReg(DestSub1)
2048 .addImm(AMDGPU::sub1);
2049 (void)FullBase;
2050 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2051
2052 return FullDestReg;
2053}
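// Roughly, the sequence materialized above for base {LoReg, HiReg} plus a
// 64-bit Offset is:
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %LoReg, <Offset lo32>, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64 %HiReg, <Offset hi32>, %carry, 0
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// which is the same shape that processBaseWithConstOffset() below expects.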
2054
2055 // Update base and offset with the NewBase and NewOffset in MI.
2056 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2057 Register NewBase,
2058 int32_t NewOffset) const {
2059 auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2060 Base->setReg(NewBase);
2061 Base->setIsKill(false);
2062 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2063}
2064
2065 std::optional<int32_t>
2066 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2067 if (Op.isImm())
2068 return Op.getImm();
2069
2070 if (!Op.isReg())
2071 return std::nullopt;
2072
2073 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2074 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2075 !Def->getOperand(1).isImm())
2076 return std::nullopt;
2077
2078 return Def->getOperand(1).getImm();
2079}
2080
2081 // Analyze Base and extract:
2082 // - 32-bit base registers, subregisters
2083 // - 64-bit constant offset
2084// Expecting base computation as:
2085// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2086// %LO:vgpr_32, %c:sreg_64_xexec =
2087// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2088// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2089// %Base:vreg_64 =
2090// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2091 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2092 MemAddress &Addr) const {
2093 if (!Base.isReg())
2094 return;
2095
2096 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2097 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2098 || Def->getNumOperands() != 5)
2099 return;
2100
2101 MachineOperand BaseLo = Def->getOperand(1);
2102 MachineOperand BaseHi = Def->getOperand(3);
2103 if (!BaseLo.isReg() || !BaseHi.isReg())
2104 return;
2105
2106 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2107 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2108
2109 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2110 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2111 return;
2112
2113 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2114 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2115
2116 auto Offset0P = extractConstOffset(*Src0);
2117 if (Offset0P)
2118 BaseLo = *Src1;
2119 else {
2120 if (!(Offset0P = extractConstOffset(*Src1)))
2121 return;
2122 BaseLo = *Src0;
2123 }
2124
2125 if (!BaseLo.isReg())
2126 return;
2127
2128 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2129 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2130
2131 if (Src0->isImm())
2132 std::swap(Src0, Src1);
2133
2134 if (!Src1->isImm() || Src0->isImm())
2135 return;
2136
2137 uint64_t Offset1 = Src1->getImm();
2138 BaseHi = *Src0;
2139
2140 if (!BaseHi.isReg())
2141 return;
2142
2143 Addr.Base.LoReg = BaseLo.getReg();
2144 Addr.Base.HiReg = BaseHi.getReg();
2145 Addr.Base.LoSubReg = BaseLo.getSubReg();
2146 Addr.Base.HiSubReg = BaseHi.getSubReg();
2147 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2148}
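// For the pattern in the comment above (S_MOV_B32 8000 feeding the low add,
// immediate 0 feeding the high add), this leaves Addr.Base.LoReg = %BASE_LO,
// Addr.Base.HiReg = %BASE_HI and Addr.Offset = 8000: the low 32 bits of the
// offset come from the V_ADD_CO_U32 source immediate and the high 32 bits
// from the V_ADDC_U32 source immediate shifted left by 32.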
2149
2150 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2151 MachineInstr &MI,
2152 MemInfoMap &Visited,
2153 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2154
2155 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2156 return false;
2157
2158 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2159 if (SIInstrInfo::isFLATScratch(MI))
2160 return false;
2161
2162 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2163 : AMDGPUAS::FLAT_ADDRESS;
2164
2165 if (AnchorList.count(&MI))
2166 return false;
2167
2168 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2169
2170 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2171 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2172 return false;
2173 }
2174
2175 // Step1: Find the base-registers and a 64bit constant offset.
2176 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2177 MemAddress MAddr;
2178 if (!Visited.contains(&MI)) {
2179 processBaseWithConstOffset(Base, MAddr);
2180 Visited[&MI] = MAddr;
2181 } else
2182 MAddr = Visited[&MI];
2183
2184 if (MAddr.Offset == 0) {
2185 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2186 " constant offsets that can be promoted.\n";);
2187 return false;
2188 }
2189
2190 LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2191 << printReg(MAddr.Base.LoReg, TRI)
2192 << "} Offset: " << MAddr.Offset << "\n\n";);
2193
2194 // Step2: Traverse through MI's basic block and find an anchor (that has the
2195 // same base registers) with the highest 13-bit distance from MI's offset.
2196 // E.g. (64bit loads)
2197 // bb:
2198 // addr1 = &a + 4096; load1 = load(addr1, 0)
2199 // addr2 = &a + 6144; load2 = load(addr2, 0)
2200 // addr3 = &a + 8192; load3 = load(addr3, 0)
2201 // addr4 = &a + 10240; load4 = load(addr4, 0)
2202 // addr5 = &a + 12288; load5 = load(addr5, 0)
2203 //
2204 // Starting from the first load, the optimization tries to find a new base
2205 // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2206 // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192
2207 // as the new base (anchor) because the maximum distance can presumably
2208 // accommodate more intermediate bases.
2209 //
2210 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2211 // (&a + 8192) for load1, load2, load4.
2212 // addr = &a + 8192
2213 // load1 = load(addr, -4096)
2214 // load2 = load(addr, -2048)
2215 // load3 = load(addr, 0)
2216 // load4 = load(addr, 2048)
2217 // addr5 = &a + 12288; load5 = load(addr5, 0)
2218 //
2219 MachineInstr *AnchorInst = nullptr;
2220 MemAddress AnchorAddr;
2221 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2222 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2223
2224 MachineBasicBlock *MBB = MI.getParent();
2225 MachineBasicBlock::iterator E = MBB->end();
2226 MachineBasicBlock::iterator MBBI = MI.getIterator();
2227 ++MBBI;
2228 const SITargetLowering *TLI =
2229 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2230
2231 for ( ; MBBI != E; ++MBBI) {
2232 MachineInstr &MINext = *MBBI;
2233 // TODO: Support finding an anchor(with same base) from store addresses or
2234 // any other load addresses where the opcodes are different.
2235 if (MINext.getOpcode() != MI.getOpcode() ||
2236 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2237 continue;
2238
2239 const MachineOperand &BaseNext =
2240 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2241 MemAddress MAddrNext;
2242 if (!Visited.contains(&MINext)) {
2243 processBaseWithConstOffset(BaseNext, MAddrNext);
2244 Visited[&MINext] = MAddrNext;
2245 } else
2246 MAddrNext = Visited[&MINext];
2247
2248 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2249 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2250 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2251 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2252 continue;
2253
2254 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2255
2256 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2257 TargetLoweringBase::AddrMode AM;
2258 AM.HasBaseReg = true;
2259 AM.BaseOffs = Dist;
2260 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2261 (uint32_t)std::abs(Dist) > MaxDist) {
2262 MaxDist = std::abs(Dist);
2263
2264 AnchorAddr = MAddrNext;
2265 AnchorInst = &MINext;
2266 }
2267 }
2268
2269 if (AnchorInst) {
2270 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2271 AnchorInst->dump());
2272 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2273 << AnchorAddr.Offset << "\n\n");
2274
2275 // Instead of moving up, just re-compute anchor-instruction's base address.
2276 Register Base = computeBase(MI, AnchorAddr);
2277
2278 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2279 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2280
2281 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2282 TargetLoweringBase::AddrMode AM;
2283 AM.HasBaseReg = true;
2284 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2285
2286 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2287 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2288 OtherMI->dump());
2289 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2290 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2291 }
2292 }
2293 AnchorList.insert(AnchorInst);
2294 return true;
2295 }
2296
2297 return false;
2298}
2299
2300 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2301 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2302 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2303 if (AddrList.front().InstClass == CI.InstClass &&
2304 AddrList.front().IsAGPR == CI.IsAGPR &&
2305 AddrList.front().hasSameBaseAddress(CI)) {
2306 AddrList.emplace_back(CI);
2307 return;
2308 }
2309 }
2310
2311 // Base address not found, so add a new list.
2312 MergeableInsts.emplace_back(1, CI);
2313}
2314
2315 std::pair<MachineBasicBlock::iterator, bool>
2316 SILoadStoreOptimizer::collectMergeableInsts(
2317 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2318 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2319 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2320 bool Modified = false;
2321
2322 // Sort potential mergeable instructions into lists. One list per base address.
2323 unsigned Order = 0;
2324 MachineBasicBlock::iterator BlockI = Begin;
2325 for (; BlockI != End; ++BlockI) {
2326 MachineInstr &MI = *BlockI;
2327
2328 // We run this before checking if an address is mergeable, because it can produce
2329 // better code even if the instructions aren't mergeable.
2330 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2331 Modified = true;
2332
2333 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2334 // barriers. We can continue looking for merges after such a barrier.
2335 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2336 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2337
2338 // Search will resume after this instruction in a separate merge list.
2339 ++BlockI;
2340 break;
2341 }
2342
2343 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2344 if (InstClass == UNKNOWN)
2345 continue;
2346
2347 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2348 int Swizzled =
2349 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2350 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2351 continue;
2352
2353 CombineInfo CI;
2354 CI.setMI(MI, *this);
2355 CI.Order = Order++;
2356
2357 if (!CI.hasMergeableAddress(*MRI))
2358 continue;
2359
2360 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2361 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2362 // operands. However, we report that ds_write2 shall have
2363 // only VGPR data so that machine copy propagation does not
2364 // create an illegal instruction with VGPR and AGPR sources.
2365 // Consequently, if we create such an instruction, the verifier
2366 // will complain.
2367 continue;
2368 }
2369
2370 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2371
2372 addInstToMergeableList(CI, MergeableInsts);
2373 }
2374
2375 // At this point we have lists of Mergeable instructions.
2376 //
2377 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2378 // list try to find an instruction that can be merged with I. If an instruction
2379 // is found, it is stored in the Paired field. If no instructions are found, then
2380 // the CombineInfo object is deleted from the list.
2381
2382 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2383 E = MergeableInsts.end(); I != E;) {
2384
2385 std::list<CombineInfo> &MergeList = *I;
2386 if (MergeList.size() <= 1) {
2387 // This means we have found only one instruction with a given address
2388 // that can be merged, and we need at least 2 instructions to do a merge,
2389 // so this list can be discarded.
2390 I = MergeableInsts.erase(I);
2391 continue;
2392 }
2393
2394 // Sort the lists by offsets, this way mergeable instructions will be
2395 // adjacent to each other in the list, which will make it easier to find
2396 // matches.
2397 MergeList.sort(
2398 [] (const CombineInfo &A, const CombineInfo &B) {
2399 return A.Offset < B.Offset;
2400 });
2401 ++I;
2402 }
2403
2404 return {BlockI, Modified};
2405}
2406
2407 // Scan through looking for adjacent LDS operations with constant offsets from
2408 // the same base register. We rely on the scheduler to do the hard work of
2409 // clustering nearby loads, and assume these are all adjacent.
2410 bool SILoadStoreOptimizer::optimizeBlock(
2411 std::list<std::list<CombineInfo> > &MergeableInsts) {
2412 bool Modified = false;
2413
2414 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2415 E = MergeableInsts.end(); I != E;) {
2416 std::list<CombineInfo> &MergeList = *I;
2417
2418 bool OptimizeListAgain = false;
2419 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2420 // We weren't able to make any changes, so delete the list so we don't
2421 // process the same instructions the next time we try to optimize this
2422 // block.
2423 I = MergeableInsts.erase(I);
2424 continue;
2425 }
2426
2427 Modified = true;
2428
2429 // We made changes, but also determined that there were no more optimization
2430 // opportunities, so we don't need to reprocess the list
2431 if (!OptimizeListAgain) {
2432 I = MergeableInsts.erase(I);
2433 continue;
2434 }
2435 OptimizeAgain = true;
2436 }
2437 return Modified;
2438}
2439
2440 bool
2441 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2442 std::list<CombineInfo> &MergeList,
2443 bool &OptimizeListAgain) {
2444 if (MergeList.empty())
2445 return false;
2446
2447 bool Modified = false;
2448
2449 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2450 Next = std::next(I)) {
2451
2452 auto First = I;
2453 auto Second = Next;
2454
2455 if ((*First).Order > (*Second).Order)
2456 std::swap(First, Second);
2457 CombineInfo &CI = *First;
2458 CombineInfo &Paired = *Second;
2459
2460 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2461 if (!Where) {
2462 ++I;
2463 continue;
2464 }
2465
2466 Modified = true;
2467
2468 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2469
2470 MachineBasicBlock::iterator NewMI;
2471 switch (CI.InstClass) {
2472 default:
2473 llvm_unreachable("unknown InstClass");
2474 break;
2475 case DS_READ:
2476 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2477 break;
2478 case DS_WRITE:
2479 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2480 break;
2481 case S_BUFFER_LOAD_IMM:
2482 case S_BUFFER_LOAD_SGPR_IMM:
2483 case S_LOAD_IMM:
2484 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2485 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2486 break;
2487 case BUFFER_LOAD:
2488 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2489 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2490 break;
2491 case BUFFER_STORE:
2492 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2493 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2494 break;
2495 case MIMG:
2496 NewMI = mergeImagePair(CI, Paired, Where->I);
2497 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2498 break;
2499 case TBUFFER_LOAD:
2500 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2501 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2502 break;
2503 case TBUFFER_STORE:
2504 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2505 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2506 break;
2507 case FLAT_LOAD:
2508 case GLOBAL_LOAD:
2509 case GLOBAL_LOAD_SADDR:
2510 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2511 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2512 break;
2513 case FLAT_STORE:
2514 case GLOBAL_STORE:
2515 case GLOBAL_STORE_SADDR:
2516 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2517 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2518 break;
2519 }
2520 CI.setMI(NewMI, *this);
2521 CI.Order = Where->Order;
2522 if (I == Second)
2523 I = Next;
2524
2525 MergeList.erase(Second);
2526 }
2527
2528 return Modified;
2529}
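// Worked example of the re-optimization loop: merging two dword buffer loads
// produces a dwordx2 and sets OptimizeListAgain (2 < 4). The merged
// instruction stays in the list via CI.setMI(NewMI, *this), so a later pass
// over the same list can combine two such dwordx2 results into a dwordx4, at
// which point 4 < 4 fails and optimizeBlock() drops the list.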
2530
2531 bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2532 if (skipFunction(MF.getFunction()))
2533 return false;
2534 return SILoadStoreOptimizer(
2535 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2536 .run(MF);
2537}
2538
2539 bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2540 STM = &MF.getSubtarget<GCNSubtarget>();
2541 if (!STM->loadStoreOptEnabled())
2542 return false;
2543
2544 TII = STM->getInstrInfo();
2545 TRI = &TII->getRegisterInfo();
2546
2547 MRI = &MF.getRegInfo();
2548
2549 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2550
2551 bool Modified = false;
2552
2553 // Contains the list of instructions for which constant offsets are being
2554 // promoted to the IMM. This is tracked for an entire block at a time.
2555 SmallPtrSet<MachineInstr *, 4> AnchorList;
2556 MemInfoMap Visited;
2557
2558 for (MachineBasicBlock &MBB : MF) {
2559 MachineBasicBlock::iterator SectionEnd;
2560 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2561 I = SectionEnd) {
2562 bool CollectModified;
2563 std::list<std::list<CombineInfo>> MergeableInsts;
2564
2565 // First pass: Collect list of all instructions we know how to merge in a
2566 // subset of the block.
2567 std::tie(SectionEnd, CollectModified) =
2568 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2569
2570 Modified |= CollectModified;
2571
2572 do {
2573 OptimizeAgain = false;
2574 Modified |= optimizeBlock(MergeableInsts);
2575 } while (OptimizeAgain);
2576 }
2577
2578 Visited.clear();
2579 AnchorList.clear();
2580 }
2581
2582 return Modified;
2583}
2584
2585 PreservedAnalyses
2586 SILoadStoreOptimizerPass::run(MachineFunction &MF,
2587 MachineFunctionAnalysisManager &MFAM) {
2588 MFPropsModifier _(*this, MF);
2589
2590 if (MF.getFunction().hasOptNone())
2591 return PreservedAnalyses::all();
2592
2593 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2594 .getManager();
2595 AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2596
2597 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2598 if (!Changed)
2599 return PreservedAnalyses::all();
2600
2601 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2602 PA.preserveSet<CFGAnalyses>();
2603 return PA;
2604}
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
bool hasOptNone() const
Do not optimize this function (-O0).
Definition: Function.h:701
bool loadStoreOptEnabled() const
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDwordx3LoadStores() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:287
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:716
bool hasScalarDwordx3Loads() const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:619
TypeSize getValue() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
An RAII based helper class to modify MachineFunctionProperties when running pass.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
virtual MachineFunctionProperties getRequiredProperties() const
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & set(Property P)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & cloneMergedMemRefs(ArrayRef< const MachineInstr * > OtherMIs) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
const MachinePointerInfo & getPointerInfo() const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void dump() const
Definition: Pass.cpp:136
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:642
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:634
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)
Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.
bool getMTBUFHasSrsrc(unsigned Opc)
int getMTBUFElements(unsigned Opc)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool getMTBUFHasSoffset(unsigned Opc)
int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)
int getMUBUFBaseOpcode(unsigned Opc)
int getMTBUFBaseOpcode(unsigned Opc)
bool getMUBUFHasVAddr(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements)
bool getMUBUFHasSoffset(unsigned Opc)
const MIMGBaseOpcodeInfo * getMIMGBaseOpcode(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
bool getMTBUFHasVAddr(unsigned Opc)
int getMUBUFElements(unsigned Opc)
const GcnBufferFormatInfo * getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI)
bool getMUBUFHasSrsrc(unsigned Opc)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool operator<(int64_t V1, const APSInt &V2)
Definition: APSInt.h:361
std::vector< std::pair< LineLocation, FunctionId > > AnchorList
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createSILoadStoreOptimizerLegacyPass()
char & SILoadStoreOptimizerLegacyID
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
This class contains a discriminated union of information about pointers in memory operands,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...