//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offsets to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset and then promotes that
// 13-bit offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes the live intervals, and then moves on to the next
//   pair. It would be better to compute a list of all merges that need to
//   occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI,
                               const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
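    // Each set dmask bit enables one data channel, so the access width in
    // dwords is the number of set bits.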
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
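    // convertSMRDOffsetUnits scales the 4-byte element size into the
    // subtarget's SMEM offset units: 1 where SMEM offsets count dwords,
    // 4 where they count bytes.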
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
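  // A and B may be reordered only if they cannot alias in memory (when at
  // least one of them writes memory) and B neither touches a register that A
  // defines nor defines a register that A reads.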
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

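  // The dmasks combine only if every set bit of the smaller mask lies strictly
  // below the lowest set bit of the larger one. E.g. dmasks 0x3 and 0xc
  // combine into a contiguous 0xf, while 0x3 and 0x6 are rejected because
  // both have bit 1 set.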
  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
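// For example, mostAlignedValueInRange(5, 19) returns 16: 16 is the only
// 16-aligned value in [5,19], and nothing in that range is 32-aligned.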
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
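  // E.g. for b32 elements, byte offsets 256 and 16384 are element offsets 64
  // and 4096; both are multiples of 64, so they still encode as offset0:1 and
  // offset1:64 in the ST64 form.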
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

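  // With ST64 each 8-bit offset field addresses multiples of 64 elements, so
  // two offsets whose difference is a multiple of 64 no larger than 0xff * 64
  // can still share a common, shifted base.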
  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with Modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
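  // For example (a sketch): when two single-dword loads are merged, the
  // subregister indices are sub0 and sub1, so each original destination
  // receives its half of DestReg.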

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      int OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
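
// Illustrative MIR sketch (virtual register numbers made up): merging
//   %1:vgpr_32 = DS_READ_B32 %addr, 0, 0
//   %2:vgpr_32 = DS_READ_B32 %addr, 4, 0
// produces
//   %3:vreg_64 = DS_READ2_B32 %addr, 0, 1, 0
//   %1:vgpr_32 = COPY %3.sub0
//   %2:vgpr_32 = COPY %3.sub1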
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1602
1603MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1604 CombineInfo &CI, CombineInfo &Paired,
1605 MachineBasicBlock::iterator InsertBefore) {
1606 MachineBasicBlock *MBB = CI.I->getParent();
1607 DebugLoc DL = CI.I->getDebugLoc();
1608
1609 const unsigned Opcode = getNewOpcode(CI, Paired);
1610
1611 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1612 Register DestReg = MRI->createVirtualRegister(SuperRC);
1613
1614 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1615
1616 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1617 MIB.add(*SAddr);
1618
1619 MachineInstr *New =
1620 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1621 .addImm(std::min(CI.Offset, Paired.Offset))
1622 .addImm(CI.CPol)
1623 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1624
1625 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1626
1627 CI.I->eraseFromParent();
1628 Paired.I->eraseFromParent();
1629 return New;
1630}
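// Illustrative sketch, not part of the source: mergeFlatLoadPair rewrites two
// adjacent global loads (register numbers invented) as
//   global_load_dword v0, v[2:3], off offset:16
//   global_load_dword v1, v[2:3], off offset:20
// ==>
//   global_load_dwordx2 v[4:5], v[2:3], off offset:16
// where copyToDestRegs copies the sub-registers of v[4:5] back into the
// original destinations v0 and v1.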
1631
1632MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1633 CombineInfo &CI, CombineInfo &Paired,
1634 MachineBasicBlock::iterator InsertBefore) {
1635 MachineBasicBlock *MBB = CI.I->getParent();
1636 DebugLoc DL = CI.I->getDebugLoc();
1637
1638 const unsigned Opcode = getNewOpcode(CI, Paired);
1639
1640 Register SrcReg =
1641 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1642
1643 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1644 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1645 .addReg(SrcReg, RegState::Kill);
1646
1647 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1648 MIB.add(*SAddr);
1649
1650 MachineInstr *New =
1651 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1652 .addImm(CI.CPol)
1653 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1654
1655 CI.I->eraseFromParent();
1656 Paired.I->eraseFromParent();
1657 return New;
1658}
1659
1660unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1661 const CombineInfo &Paired) {
1662 const unsigned Width = CI.Width + Paired.Width;
1663
1664 switch (getCommonInstClass(CI, Paired)) {
1665 default:
1666 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1667 // FIXME: Handle d16 correctly
1668 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1669 Width);
1670 case TBUFFER_LOAD:
1671 case TBUFFER_STORE:
1672 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1673 Width);
1674
1675 case UNKNOWN:
1676 llvm_unreachable("Unknown instruction class");
1677 case S_BUFFER_LOAD_IMM:
1678 switch (Width) {
1679 default:
1680 return 0;
1681 case 2:
1682 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1683 case 3:
1684 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1685 case 4:
1686 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1687 case 8:
1688 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1689 }
1690 case S_BUFFER_LOAD_SGPR_IMM:
1691 switch (Width) {
1692 default:
1693 return 0;
1694 case 2:
1695 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1696 case 3:
1697 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1698 case 4:
1699 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1700 case 8:
1701 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1702 }
1703 case S_LOAD_IMM:
1704 switch (Width) {
1705 default:
1706 return 0;
1707 case 2:
1708 return AMDGPU::S_LOAD_DWORDX2_IMM;
1709 case 3:
1710 return AMDGPU::S_LOAD_DWORDX3_IMM;
1711 case 4:
1712 return AMDGPU::S_LOAD_DWORDX4_IMM;
1713 case 8:
1714 return AMDGPU::S_LOAD_DWORDX8_IMM;
1715 }
1716 case GLOBAL_LOAD:
1717 switch (Width) {
1718 default:
1719 return 0;
1720 case 2:
1721 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1722 case 3:
1723 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1724 case 4:
1725 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1726 }
1727 case GLOBAL_LOAD_SADDR:
1728 switch (Width) {
1729 default:
1730 return 0;
1731 case 2:
1732 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1733 case 3:
1734 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1735 case 4:
1736 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1737 }
1738 case GLOBAL_STORE:
1739 switch (Width) {
1740 default:
1741 return 0;
1742 case 2:
1743 return AMDGPU::GLOBAL_STORE_DWORDX2;
1744 case 3:
1745 return AMDGPU::GLOBAL_STORE_DWORDX3;
1746 case 4:
1747 return AMDGPU::GLOBAL_STORE_DWORDX4;
1748 }
1749 case GLOBAL_STORE_SADDR:
1750 switch (Width) {
1751 default:
1752 return 0;
1753 case 2:
1754 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1755 case 3:
1756 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1757 case 4:
1758 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1759 }
1760 case FLAT_LOAD:
1761 switch (Width) {
1762 default:
1763 return 0;
1764 case 2:
1765 return AMDGPU::FLAT_LOAD_DWORDX2;
1766 case 3:
1767 return AMDGPU::FLAT_LOAD_DWORDX3;
1768 case 4:
1769 return AMDGPU::FLAT_LOAD_DWORDX4;
1770 }
1771 case FLAT_STORE:
1772 switch (Width) {
1773 default:
1774 return 0;
1775 case 2:
1776 return AMDGPU::FLAT_STORE_DWORDX2;
1777 case 3:
1778 return AMDGPU::FLAT_STORE_DWORDX3;
1779 case 4:
1780 return AMDGPU::FLAT_STORE_DWORDX4;
1781 }
1782 case MIMG:
1783 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1784 "No overlaps");
1785 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1786 }
1787}
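// Example, for illustration only: merging two S_LOAD_DWORD_IMM loads gives
// Width == 2, so getNewOpcode returns AMDGPU::S_LOAD_DWORDX2_IMM. A combined
// width with no matching opcode (e.g. 5) falls into the inner default case
// and yields 0, signalling that the combination is unsupported.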
1788
1789std::pair<unsigned, unsigned>
1790SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1791 const CombineInfo &Paired) {
1792 assert((CI.InstClass != MIMG ||
1793 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1794 CI.Width + Paired.Width)) &&
1795 "No overlaps");
1796
1797 unsigned Idx0;
1798 unsigned Idx1;
1799
1800 static const unsigned Idxs[5][4] = {
1801 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1802 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1803 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1804 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1805 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1806 };
1807
1808 assert(CI.Width >= 1 && CI.Width <= 4);
1809 assert(Paired.Width >= 1 && Paired.Width <= 4);
1810
1811 if (Paired < CI) {
1812 Idx1 = Idxs[0][Paired.Width - 1];
1813 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1814 } else {
1815 Idx0 = Idxs[0][CI.Width - 1];
1816 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1817 }
1818
1819 return {Idx0, Idx1};
1820}
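// Worked example, for illustration only: with CI.Width == 2, Paired.Width == 1
// and CI ordered first, Idx0 = Idxs[0][1] = sub0_sub1 and
// Idx1 = Idxs[2][0] = sub2, i.e. CI's data occupies lanes 0-1 of the merged
// register and Paired's data occupies lane 2.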
1821
1822const TargetRegisterClass *
1823SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1824 const CombineInfo &Paired) const {
1825 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1826 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1827 switch (CI.Width + Paired.Width) {
1828 default:
1829 return nullptr;
1830 case 2:
1831 return &AMDGPU::SReg_64_XEXECRegClass;
1832 case 3:
1833 return &AMDGPU::SGPR_96RegClass;
1834 case 4:
1835 return &AMDGPU::SGPR_128RegClass;
1836 case 8:
1837 return &AMDGPU::SGPR_256RegClass;
1838 case 16:
1839 return &AMDGPU::SGPR_512RegClass;
1840 }
1841 }
1842
1843 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1844 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1845 ? TRI->getAGPRClassForBitWidth(BitWidth)
1846 : TRI->getVGPRClassForBitWidth(BitWidth);
1847}
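// Example, for illustration only: a merged scalar load of combined width 2 is
// assigned SReg_64_XEXEC, while a merged VMEM access of combined width 3 gets
// the 96-bit VGPR class, or the matching AGPR class when the original data
// register was an AGPR.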
1848
1849MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1850 CombineInfo &CI, CombineInfo &Paired,
1851 MachineBasicBlock::iterator InsertBefore) {
1852 MachineBasicBlock *MBB = CI.I->getParent();
1853 DebugLoc DL = CI.I->getDebugLoc();
1854
1855 const unsigned Opcode = getNewOpcode(CI, Paired);
1856
1857 Register SrcReg =
1858 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1859
1860 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1861 .addReg(SrcReg, RegState::Kill);
1862
1863 AddressRegs Regs = getRegs(Opcode, *TII);
1864
1865 if (Regs.VAddr)
1866 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1867
1868
1869 // It shouldn't be possible to get this far if the two instructions
1870 // don't have a single memoperand, because MachineInstr::mayAlias()
1871 // will return true if this is the case.
1872 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1873
1874 MachineInstr *New =
1875 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1876 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1877 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1878 .addImm(CI.CPol) // cpol
1879 .addImm(0) // swz
1880 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1881
1882 CI.I->eraseFromParent();
1883 Paired.I->eraseFromParent();
1884 return New;
1885}
1886
1887MachineOperand
1888SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1889 APInt V(32, Val, true);
1890 if (TII->isInlineConstant(V))
1891 return MachineOperand::CreateImm(Val);
1892
1893 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1894 MachineInstr *Mov =
1895 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1896 TII->get(AMDGPU::S_MOV_B32), Reg)
1897 .addImm(Val);
1898 (void)Mov;
1899 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1900 return MachineOperand::CreateReg(Reg, false);
1901}
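// Example, for illustration only: createRegOrImm(4, MI) returns an immediate
// operand, since 4 is an inline constant, whereas createRegOrImm(0x1800, MI)
// emits (register name invented)
//   %tmp:sreg_32 = S_MOV_B32 0x1800
// and returns %tmp as a register operand.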
1902
1903// Compute base address using Addr and return the final register.
1904Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1905 const MemAddress &Addr) const {
1906 MachineBasicBlock *MBB = MI.getParent();
1907 MachineBasicBlock::iterator MBBI = MI.getIterator();
1908 DebugLoc DL = MI.getDebugLoc();
1909
1910 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1911 Addr.Base.LoSubReg) &&
1912 "Expected 32-bit Base-Register-Low!!");
1913
1914 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1915 Addr.Base.HiSubReg) &&
1916 "Expected 32-bit Base-Register-Hi!!");
1917
1918 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1919 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1920 MachineOperand OffsetHi =
1921 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1922
1923 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1924 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1925 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1926
1927 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1928 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1929 MachineInstr *LoHalf =
1930 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1931 .addReg(CarryReg, RegState::Define)
1932 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1933 .add(OffsetLo)
1934 .addImm(0); // clamp bit
1935 (void)LoHalf;
1936 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1937
1938 MachineInstr *HiHalf =
1939 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1940 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1941 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1942 .add(OffsetHi)
1943 .addReg(CarryReg, RegState::Kill)
1944 .addImm(0); // clamp bit
1945 (void)HiHalf;
1946 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1947
1948 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1949 MachineInstr *FullBase =
1950 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1951 .addReg(DestSub0)
1952 .addImm(AMDGPU::sub0)
1953 .addReg(DestSub1)
1954 .addImm(AMDGPU::sub1);
1955 (void)FullBase;
1956 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1957
1958 return FullDestReg;
1959}
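// Illustrative MIR emitted by computeBase for a non-inline Addr.Offset such as
// 4096 (all register names invented):
//   %off:sgpr_32 = S_MOV_B32 4096
//   %lo:vgpr_32, %c:sreg_64_xexec = V_ADD_CO_U32_e64 %base_lo, %off, 0
//   %hi:vgpr_32, %d:sreg_64_xexec = V_ADDC_U32_e64 %base_hi, 0, %c, 0
//   %new:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The function returns %new as the recomputed base register.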
1960
1961// Update base and offset with the NewBase and NewOffset in MI.
1962void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1963 Register NewBase,
1964 int32_t NewOffset) const {
1965 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1966 Base->setReg(NewBase);
1967 Base->setIsKill(false);
1968 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1969}
1970
1971std::optional<int32_t>
1972SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1973 if (Op.isImm())
1974 return Op.getImm();
1975
1976 if (!Op.isReg())
1977 return std::nullopt;
1978
1979 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1980 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1981 !Def->getOperand(1).isImm())
1982 return std::nullopt;
1983
1984 return Def->getOperand(1).getImm();
1985}
1986
1987// Analyzes Base and extracts:
1988// - 32-bit base registers and subregisters
1989// - a 64-bit constant offset
1990// Expecting base computation as:
1991// %OFFSET0:sgpr_32 = S_MOV_B32 8000
1992// %LO:vgpr_32, %c:sreg_64_xexec =
1993// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1994// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1995// %Base:vreg_64 =
1996// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1997void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1998 MemAddress &Addr) const {
1999 if (!Base.isReg())
2000 return;
2001
2002 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2003 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2004 || Def->getNumOperands() != 5)
2005 return;
2006
2007 MachineOperand BaseLo = Def->getOperand(1);
2008 MachineOperand BaseHi = Def->getOperand(3);
2009 if (!BaseLo.isReg() || !BaseHi.isReg())
2010 return;
2011
2012 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2013 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2014
2015 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2016 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2017 return;
2018
2019 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2020 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2021
2022 auto Offset0P = extractConstOffset(*Src0);
2023 if (Offset0P)
2024 BaseLo = *Src1;
2025 else {
2026 if (!(Offset0P = extractConstOffset(*Src1)))
2027 return;
2028 BaseLo = *Src0;
2029 }
2030
2031 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2032 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2033
2034 if (Src0->isImm())
2035 std::swap(Src0, Src1);
2036
2037 if (!Src1->isImm())
2038 return;
2039
2040 uint64_t Offset1 = Src1->getImm();
2041 BaseHi = *Src0;
2042
2043 Addr.Base.LoReg = BaseLo.getReg();
2044 Addr.Base.HiReg = BaseHi.getReg();
2045 Addr.Base.LoSubReg = BaseLo.getSubReg();
2046 Addr.Base.HiSubReg = BaseHi.getSubReg();
2047 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2048}
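// Example, for illustration only, following the pattern sketched above: with
// %OFFSET0:sgpr_32 = S_MOV_B32 8000 feeding the V_ADD_CO_U32_e64 and an
// immediate 0 on the V_ADDC_U32_e64, the function records %BASE_LO/%BASE_HI
// (plus subregisters) in Addr.Base and sets Addr.Offset to 8000, with the
// high-half immediate forming bits 63:32.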
2049
2050bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2051 MachineInstr &MI,
2052 MemInfoMap &Visited,
2053 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2054
2055 if (!(MI.mayLoad() ^ MI.mayStore()))
2056 return false;
2057
2058 // TODO: Support flat and scratch.
2059 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2060 return false;
2061
2062 if (MI.mayLoad() &&
2063 TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2064 return false;
2065
2066 if (AnchorList.count(&MI))
2067 return false;
2068
2069 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2070
2071 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2072 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2073 return false;
2074 }
2075
2076 // Step1: Find the base-registers and a 64bit constant offset.
2077 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2078 MemAddress MAddr;
2079 if (!Visited.contains(&MI)) {
2080 processBaseWithConstOffset(Base, MAddr);
2081 Visited[&MI] = MAddr;
2082 } else
2083 MAddr = Visited[&MI];
2084
2085 if (MAddr.Offset == 0) {
2086 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2087 " constant offsets that can be promoted.\n";);
2088 return false;
2089 }
2090
2091 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2092 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2093
2094 // Step2: Traverse through MI's basic block and find an anchor (one with
2095 // the same base registers) at the highest 13-bit distance from MI's offset.
2096 // E.g. (64-bit loads)
2097 // bb:
2098 // addr1 = &a + 4096; load1 = load(addr1, 0)
2099 // addr2 = &a + 6144; load2 = load(addr2, 0)
2100 // addr3 = &a + 8192; load3 = load(addr3, 0)
2101 // addr4 = &a + 10240; load4 = load(addr4, 0)
2102 // addr5 = &a + 12288; load5 = load(addr5, 0)
2103 //
2104 // Starting from the first load, the optimization will try to find a new base
2105 // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2106 // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192
2107 // as the new base (anchor) because its distance is maximal, which
2108 // presumably lets it accommodate more intermediate bases.
2109 //
2110 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2111 // (&a + 8192) for load1, load2, load3, and load4.
2112 // addr = &a + 8192
2113 // load1 = load(addr, -4096)
2114 // load2 = load(addr, -2048)
2115 // load3 = load(addr, 0)
2116 // load4 = load(addr, 2048)
2117 // addr5 = &a + 12288; load5 = load(addr5, 0)
2118 //
2119 MachineInstr *AnchorInst = nullptr;
2120 MemAddress AnchorAddr;
2121 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2122 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2123
2124 MachineBasicBlock *MBB = MI.getParent();
2125 MachineBasicBlock::iterator E = MBB->end();
2126 MachineBasicBlock::iterator MBBI = MI.getIterator();
2127 ++MBBI;
2128 const SITargetLowering *TLI =
2129 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2130
2131 for ( ; MBBI != E; ++MBBI) {
2132 MachineInstr &MINext = *MBBI;
2133 // TODO: Support finding an anchor(with same base) from store addresses or
2134 // any other load addresses where the opcodes are different.
2135 if (MINext.getOpcode() != MI.getOpcode() ||
2136 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2137 continue;
2138
2139 const MachineOperand &BaseNext =
2140 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2141 MemAddress MAddrNext;
2142 if (!Visited.contains(&MINext)) {
2143 processBaseWithConstOffset(BaseNext, MAddrNext);
2144 Visited[&MINext] = MAddrNext;
2145 } else
2146 MAddrNext = Visited[&MINext];
2147
2148 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2149 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2150 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2151 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2152 continue;
2153
2154 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2155
2156 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2157 TargetLoweringBase::AddrMode AM;
2158 AM.HasBaseReg = true;
2159 AM.BaseOffs = Dist;
2160 if (TLI->isLegalGlobalAddressingMode(AM) &&
2161 (uint32_t)std::abs(Dist) > MaxDist) {
2162 MaxDist = std::abs(Dist);
2163
2164 AnchorAddr = MAddrNext;
2165 AnchorInst = &MINext;
2166 }
2167 }
2168
2169 if (AnchorInst) {
2170 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2171 AnchorInst->dump());
2172 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2173 << AnchorAddr.Offset << "\n\n");
2174
2175 // Instead of moving up, just re-compute anchor-instruction's base address.
2176 Register Base = computeBase(MI, AnchorAddr);
2177
2178 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2179 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2180
2181 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2182 TargetLoweringBase::AddrMode AM;
2183 AM.HasBaseReg = true;
2184 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2185
2186 if (TLI->isLegalGlobalAddressingMode(AM)) {
2187 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2188 OtherMI->dump());
2189 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2190 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2191 }
2192 }
2193 AnchorList.insert(AnchorInst);
2194 return true;
2195 }
2196
2197 return false;
2198}
2199
2200void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2201 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2202 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2203 if (AddrList.front().InstClass == CI.InstClass &&
2204 AddrList.front().IsAGPR == CI.IsAGPR &&
2205 AddrList.front().hasSameBaseAddress(CI)) {
2206 AddrList.emplace_back(CI);
2207 return;
2208 }
2209 }
2210
2211 // Base address not found, so add a new list.
2212 MergeableInsts.emplace_back(1, CI);
2213}
2214
2215std::pair<MachineBasicBlock::iterator, bool>
2216SILoadStoreOptimizer::collectMergeableInsts(
2217 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2218 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2219 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2220 bool Modified = false;
2221
2222 // Sort potentially mergeable instructions into lists, one list per base address.
2223 unsigned Order = 0;
2224 MachineBasicBlock::iterator BlockI = Begin;
2225 for (; BlockI != End; ++BlockI) {
2226 MachineInstr &MI = *BlockI;
2227
2228 // We run this before checking if an address is mergeable, because it can produce
2229 // better code even if the instructions aren't mergeable.
2230 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2231 Modified = true;
2232
2233 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2234 // barriers. Merging can resume after the barrier, but in a separate list.
2235 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2236 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2237
2238 // Search will resume after this instruction in a separate merge list.
2239 ++BlockI;
2240 break;
2241 }
2242
2243 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2244 if (InstClass == UNKNOWN)
2245 continue;
2246
2247 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2248 int Swizzled =
2249 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2250 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2251 continue;
2252
2253 CombineInfo CI;
2254 CI.setMI(MI, *this);
2255 CI.Order = Order++;
2256
2257 if (!CI.hasMergeableAddress(*MRI))
2258 continue;
2259
2260 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2261 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2262 // operands. However, we are reporting that ds_write2 shall have
2263 // only VGPR data so that machine copy propagation does not
2264 // create an illegal instruction with VGPR and AGPR sources.
2265 // Consequently, if we create such an instruction, the verifier
2266 // will complain.
2267 continue;
2268 }
2269
2270 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2271
2272 addInstToMergeableList(CI, MergeableInsts);
2273 }
2274
2275 // At this point we have lists of Mergeable instructions.
2276 //
2277 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2278 // list try to find an instruction that can be merged with it. If an
2279 // instruction is found, it is stored in the Paired field. If no instructions
2280 // are found, then the CombineInfo object is deleted from the list.
2281
2282 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2283 E = MergeableInsts.end(); I != E;) {
2284
2285 std::list<CombineInfo> &MergeList = *I;
2286 if (MergeList.size() <= 1) {
2287 // This means we have found only one instruction with a given address
2288 // that can be merged, and we need at least 2 instructions to do a merge,
2289 // so this list can be discarded.
2290 I = MergeableInsts.erase(I);
2291 continue;
2292 }
2293
2294 // Sort the lists by offsets, this way mergeable instructions will be
2295 // adjacent to each other in the list, which will make it easier to find
2296 // matches.
2297 MergeList.sort(
2298 [] (const CombineInfo &A, const CombineInfo &B) {
2299 return A.Offset < B.Offset;
2300 });
2301 ++I;
2302 }
2303
2304 return {BlockI, Modified};
2305}
2306
2307// Scan through looking for adjacent LDS operations with constant offsets from
2308// the same base register. We rely on the scheduler to do the hard work of
2309// clustering nearby loads, and assume these are all adjacent.
2310bool SILoadStoreOptimizer::optimizeBlock(
2311 std::list<std::list<CombineInfo> > &MergeableInsts) {
2312 bool Modified = false;
2313
2314 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2315 E = MergeableInsts.end(); I != E;) {
2316 std::list<CombineInfo> &MergeList = *I;
2317
2318 bool OptimizeListAgain = false;
2319 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2320 // We weren't able to make any changes, so delete the list so we don't
2321 // process the same instructions the next time we try to optimize this
2322 // block.
2323 I = MergeableInsts.erase(I);
2324 continue;
2325 }
2326
2327 Modified = true;
2328
2329 // We made changes, but also determined that there were no more optimization
2330 // opportunities, so we don't need to reprocess the list.
2331 if (!OptimizeListAgain) {
2332 I = MergeableInsts.erase(I);
2333 continue;
2334 }
2335 OptimizeAgain = true;
2336 }
2337 return Modified;
2338}
2339
2340bool
2341SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2342 std::list<CombineInfo> &MergeList,
2343 bool &OptimizeListAgain) {
2344 if (MergeList.empty())
2345 return false;
2346
2347 bool Modified = false;
2348
2349 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2350 Next = std::next(I)) {
2351
2352 auto First = I;
2353 auto Second = Next;
2354
2355 if ((*First).Order > (*Second).Order)
2356 std::swap(First, Second);
2357 CombineInfo &CI = *First;
2358 CombineInfo &Paired = *Second;
2359
2360 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2361 if (!Where) {
2362 ++I;
2363 continue;
2364 }
2365
2366 Modified = true;
2367
2368 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2369
2371 switch (CI.InstClass) {
2372 default:
2373 llvm_unreachable("unknown InstClass");
2374 break;
2375 case DS_READ:
2376 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2377 break;
2378 case DS_WRITE:
2379 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2380 break;
2381 case S_BUFFER_LOAD_IMM:
2382 case S_BUFFER_LOAD_SGPR_IMM:
2383 case S_LOAD_IMM:
2384 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2385 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2386 break;
2387 case BUFFER_LOAD:
2388 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2389 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2390 break;
2391 case BUFFER_STORE:
2392 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2393 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2394 break;
2395 case MIMG:
2396 NewMI = mergeImagePair(CI, Paired, Where->I);
2397 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2398 break;
2399 case TBUFFER_LOAD:
2400 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2401 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2402 break;
2403 case TBUFFER_STORE:
2404 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2405 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2406 break;
2407 case FLAT_LOAD:
2408 case GLOBAL_LOAD:
2409 case GLOBAL_LOAD_SADDR:
2410 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2411 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2412 break;
2413 case FLAT_STORE:
2414 case GLOBAL_STORE:
2415 case GLOBAL_STORE_SADDR:
2416 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2417 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2418 break;
2419 }
2420 CI.setMI(NewMI, *this);
2421 CI.Order = Where->Order;
2422 if (I == Second)
2423 I = Next;
2424
2425 MergeList.erase(Second);
2426 }
2427
2428 return Modified;
2429}
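// Illustrative note: OptimizeListAgain drives iterative widening. Merging two
// dword buffer loads yields a combined width of 2 (< 4), so the list is kept,
// OptimizeAgain is set, and the new x2 load may later merge with an adjacent
// x2 into an x4.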
2430
2431bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2432 if (skipFunction(MF.getFunction()))
2433 return false;
2434
2435 STM = &MF.getSubtarget<GCNSubtarget>();
2436 if (!STM->loadStoreOptEnabled())
2437 return false;
2438
2439 TII = STM->getInstrInfo();
2440 TRI = &TII->getRegisterInfo();
2441
2442 MRI = &MF.getRegInfo();
2443 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2444
2445 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2446
2447 bool Modified = false;
2448
2449 // Contains the list of instructions for which constant offsets are being
2450 // promoted to the IMM. This is tracked for an entire block at a time.
2451 SmallPtrSet<MachineInstr *, 4> AnchorList;
2452 MemInfoMap Visited;
2453
2454 for (MachineBasicBlock &MBB : MF) {
2455 MachineBasicBlock::iterator SectionEnd;
2456 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2457 I = SectionEnd) {
2458 bool CollectModified;
2459 std::list<std::list<CombineInfo>> MergeableInsts;
2460
2461 // First pass: Collect list of all instructions we know how to merge in a
2462 // subset of the block.
2463 std::tie(SectionEnd, CollectModified) =
2464 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2465
2466 Modified |= CollectModified;
2467
2468 do {
2469 OptimizeAgain = false;
2470 Modified |= optimizeBlock(MergeableInsts);
2471 } while (OptimizeAgain);
2472 }
2473
2474 Visited.clear();
2475 AnchorList.clear();
2476 }
2477
2478 return Modified;
2479}