SILoadStoreOptimizer.cpp
1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions that have nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset into the instruction's
23// immediate field by adjusting the base. It tries to use a base from nearby
24// instructions that allows a 13-bit constant offset, which is then promoted
25// into the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset fields, but are close enough to each other to fit within an 8-bit
56// range, we can add to the base pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
62#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63#include "llvm/Analysis/AliasAnalysis.h"
64#include "llvm/CodeGen/MachineFunctionPass.h"
65#include "llvm/InitializePasses.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 S_BUFFER_LOAD_SGPR_IMM,
78 S_LOAD_IMM,
79 BUFFER_LOAD,
80 BUFFER_STORE,
81 MIMG,
82 TBUFFER_LOAD,
83 TBUFFER_STORE,
84 GLOBAL_LOAD_SADDR,
85 GLOBAL_STORE_SADDR,
86 FLAT_LOAD,
87 FLAT_STORE,
88 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89 GLOBAL_STORE // any CombineInfo, they are only ever returned by
90 // getCommonInstClass.
91};
92
93struct AddressRegs {
94 unsigned char NumVAddrs = 0;
95 bool SBase = false;
96 bool SRsrc = false;
97 bool SOffset = false;
98 bool SAddr = false;
99 bool VAddr = false;
100 bool Addr = false;
101 bool SSamp = false;
102};
103
104// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105const unsigned MaxAddressRegs = 12 + 1 + 1;
106
107class SILoadStoreOptimizer : public MachineFunctionPass {
108 struct CombineInfo {
109 MachineBasicBlock::iterator I;
110 unsigned EltSize;
111 unsigned Offset;
112 unsigned Width;
113 unsigned Format;
114 unsigned BaseOff;
115 unsigned DMask;
116 InstClassEnum InstClass;
117 unsigned CPol = 0;
118 bool IsAGPR;
119 bool UseST64;
120 int AddrIdx[MaxAddressRegs];
121 const MachineOperand *AddrReg[MaxAddressRegs];
122 unsigned NumAddresses;
123 unsigned Order;
124
125 bool hasSameBaseAddress(const CombineInfo &CI) {
126 if (NumAddresses != CI.NumAddresses)
127 return false;
128
129 const MachineInstr &MI = *CI.I;
130 for (unsigned i = 0; i < NumAddresses; i++) {
131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132
133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136 return false;
137 }
138 continue;
139 }
140
141 // Check same base pointer. Be careful of subregisters, which can occur
142 // with vectors of pointers.
143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145 return false;
146 }
147 }
148 return true;
149 }
150
151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152 for (unsigned i = 0; i < NumAddresses; ++i) {
153 const MachineOperand *AddrOp = AddrReg[i];
154 // Immediates are always OK.
155 if (AddrOp->isImm())
156 continue;
157
158 // Don't try to merge addresses that aren't either immediates or registers.
159 // TODO: Should be possible to merge FrameIndexes and maybe some other
160 // non-register operands.
161 if (!AddrOp->isReg())
162 return false;
163
164 // TODO: We should be able to merge physical reg addresses.
165 if (AddrOp->getReg().isPhysical())
166 return false;
167
168 // If an address has only one use then there will be no other
169 // instructions with the same address, so we can't merge this one.
170 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
171 return false;
172 }
173 return true;
174 }
175
176 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
177
178 // Compare by pointer order.
179 bool operator<(const CombineInfo& Other) const {
180 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
181 }
182 };
183
184 struct BaseRegisters {
185 Register LoReg;
186 Register HiReg;
187
188 unsigned LoSubReg = 0;
189 unsigned HiSubReg = 0;
190 };
191
192 struct MemAddress {
193 BaseRegisters Base;
194 int64_t Offset = 0;
195 };
196
197 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
198
199private:
200 const GCNSubtarget *STM = nullptr;
201 const SIInstrInfo *TII = nullptr;
202 const SIRegisterInfo *TRI = nullptr;
203 MachineRegisterInfo *MRI = nullptr;
204 AliasAnalysis *AA = nullptr;
205 bool OptimizeAgain;
206
207 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
208 const DenseSet<Register> &ARegUses,
209 const MachineInstr &A, const MachineInstr &B) const;
210 static bool dmasksCanBeCombined(const CombineInfo &CI,
211 const SIInstrInfo &TII,
212 const CombineInfo &Paired);
213 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
214 CombineInfo &Paired, bool Modify = false);
215 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
216 const CombineInfo &Paired);
217 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
218 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
219 const CombineInfo &Paired);
220 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
221 const CombineInfo &Paired);
222 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
223
224 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
225
226 unsigned read2Opcode(unsigned EltSize) const;
227 unsigned read2ST64Opcode(unsigned EltSize) const;
228 MachineBasicBlock::iterator
229 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
230 MachineBasicBlock::iterator InsertBefore);
231
232 unsigned write2Opcode(unsigned EltSize) const;
233 unsigned write2ST64Opcode(unsigned EltSize) const;
234 MachineBasicBlock::iterator
235 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
236 MachineBasicBlock::iterator InsertBefore);
237 MachineBasicBlock::iterator
238 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
239 MachineBasicBlock::iterator InsertBefore);
240 MachineBasicBlock::iterator
241 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
242 MachineBasicBlock::iterator InsertBefore);
243 MachineBasicBlock::iterator
244 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245 MachineBasicBlock::iterator InsertBefore);
246 MachineBasicBlock::iterator
247 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248 MachineBasicBlock::iterator InsertBefore);
249 MachineBasicBlock::iterator
250 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
251 MachineBasicBlock::iterator InsertBefore);
252 MachineBasicBlock::iterator
253 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
254 MachineBasicBlock::iterator InsertBefore);
255 MachineBasicBlock::iterator
256 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
257 MachineBasicBlock::iterator InsertBefore);
258 MachineBasicBlock::iterator
259 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
260 MachineBasicBlock::iterator InsertBefore);
261
262 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
263 int32_t NewOffset) const;
264 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
265 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
266 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
267 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
268 /// Promotes a constant offset to the immediate by adjusting the base. It
269 /// tries to use a base from nearby instructions that allows it to have a
270 /// 13-bit constant offset, which then gets promoted to the immediate.
271 bool promoteConstantOffsetToImm(MachineInstr &CI,
272 MemInfoMap &Visited,
273 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
274 void addInstToMergeableList(const CombineInfo &CI,
275 std::list<std::list<CombineInfo> > &MergeableInsts) const;
276
277 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
279 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
280 std::list<std::list<CombineInfo>> &MergeableInsts) const;
281
282 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
283 const CombineInfo &Paired);
284
285 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
286 const CombineInfo &Paired);
287
288public:
289 static char ID;
290
291 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
292 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
293 }
294
295 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
296 bool &OptimizeListAgain);
297 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
298
299 bool runOnMachineFunction(MachineFunction &MF) override;
300
301 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
302
303 void getAnalysisUsage(AnalysisUsage &AU) const override {
304 AU.setPreservesCFG();
305 AU.addRequired<AAResultsWrapperPass>();
306
307 MachineFunctionPass::getAnalysisUsage(AU);
308 }
309
310 MachineFunctionProperties getRequiredProperties() const override {
311 return MachineFunctionProperties()
312 .set(MachineFunctionProperties::Property::IsSSA);
313 }
314};
315
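// Return the number of dwords accessed by \p MI (for MIMG, the number of
// enabled dmask channels), or 0 for opcodes this pass does not know about.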
316static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
317 const unsigned Opc = MI.getOpcode();
318
319 if (TII.isMUBUF(Opc)) {
320 // FIXME: Handle d16 correctly
321 return AMDGPU::getMUBUFElements(Opc);
322 }
323 if (TII.isMIMG(MI)) {
324 uint64_t DMaskImm =
325 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
326 return llvm::popcount(DMaskImm);
327 }
328 if (TII.isMTBUF(Opc)) {
329 return AMDGPU::getMTBUFElements(Opc);
330 }
331
332 switch (Opc) {
333 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
334 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
335 case AMDGPU::S_LOAD_DWORD_IMM:
336 case AMDGPU::GLOBAL_LOAD_DWORD:
337 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
338 case AMDGPU::GLOBAL_STORE_DWORD:
339 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
340 case AMDGPU::FLAT_LOAD_DWORD:
341 case AMDGPU::FLAT_STORE_DWORD:
342 return 1;
343 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
344 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
345 case AMDGPU::S_LOAD_DWORDX2_IMM:
346 case AMDGPU::GLOBAL_LOAD_DWORDX2:
347 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
348 case AMDGPU::GLOBAL_STORE_DWORDX2:
349 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
350 case AMDGPU::FLAT_LOAD_DWORDX2:
351 case AMDGPU::FLAT_STORE_DWORDX2:
352 return 2;
353 case AMDGPU::GLOBAL_LOAD_DWORDX3:
354 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
355 case AMDGPU::GLOBAL_STORE_DWORDX3:
356 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
357 case AMDGPU::FLAT_LOAD_DWORDX3:
358 case AMDGPU::FLAT_STORE_DWORDX3:
359 return 3;
360 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
361 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
362 case AMDGPU::S_LOAD_DWORDX4_IMM:
363 case AMDGPU::GLOBAL_LOAD_DWORDX4:
364 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
365 case AMDGPU::GLOBAL_STORE_DWORDX4:
366 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
367 case AMDGPU::FLAT_LOAD_DWORDX4:
368 case AMDGPU::FLAT_STORE_DWORDX4:
369 return 4;
370 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
372 case AMDGPU::S_LOAD_DWORDX8_IMM:
373 return 8;
374 case AMDGPU::DS_READ_B32: [[fallthrough]];
375 case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
376 case AMDGPU::DS_WRITE_B32: [[fallthrough]];
377 case AMDGPU::DS_WRITE_B32_gfx9:
378 return 1;
379 case AMDGPU::DS_READ_B64: [[fallthrough]];
380 case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
381 case AMDGPU::DS_WRITE_B64: [[fallthrough]];
382 case AMDGPU::DS_WRITE_B64_gfx9:
383 return 2;
384 default:
385 return 0;
386 }
387}
388
389/// Maps instruction opcode to enum InstClassEnum.
390static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
391 switch (Opc) {
392 default:
393 if (TII.isMUBUF(Opc)) {
394 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
395 default:
396 return UNKNOWN;
397 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
398 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
399 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
400 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
401 return BUFFER_LOAD;
402 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
403 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
404 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
405 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
406 return BUFFER_STORE;
407 }
408 }
409 if (TII.isMIMG(Opc)) {
410 // Ignore instructions encoded without vaddr.
411 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
412 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
413 return UNKNOWN;
414 // Ignore BVH instructions
415 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
416 return UNKNOWN;
417 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
418 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
419 TII.isGather4(Opc))
420 return UNKNOWN;
421 return MIMG;
422 }
423 if (TII.isMTBUF(Opc)) {
424 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
425 default:
426 return UNKNOWN;
427 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
428 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
429 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
430 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
431 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
432 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
433 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
434 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
435 return TBUFFER_LOAD;
436 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
437 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
438 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
439 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
440 return TBUFFER_STORE;
441 }
442 }
443 return UNKNOWN;
444 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
445 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
446 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
447 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
448 return S_BUFFER_LOAD_IMM;
449 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
450 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
451 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
452 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
453 return S_BUFFER_LOAD_SGPR_IMM;
454 case AMDGPU::S_LOAD_DWORD_IMM:
455 case AMDGPU::S_LOAD_DWORDX2_IMM:
456 case AMDGPU::S_LOAD_DWORDX4_IMM:
457 case AMDGPU::S_LOAD_DWORDX8_IMM:
458 return S_LOAD_IMM;
459 case AMDGPU::DS_READ_B32:
460 case AMDGPU::DS_READ_B32_gfx9:
461 case AMDGPU::DS_READ_B64:
462 case AMDGPU::DS_READ_B64_gfx9:
463 return DS_READ;
464 case AMDGPU::DS_WRITE_B32:
465 case AMDGPU::DS_WRITE_B32_gfx9:
466 case AMDGPU::DS_WRITE_B64:
467 case AMDGPU::DS_WRITE_B64_gfx9:
468 return DS_WRITE;
469 case AMDGPU::GLOBAL_LOAD_DWORD:
470 case AMDGPU::GLOBAL_LOAD_DWORDX2:
471 case AMDGPU::GLOBAL_LOAD_DWORDX3:
472 case AMDGPU::GLOBAL_LOAD_DWORDX4:
473 case AMDGPU::FLAT_LOAD_DWORD:
474 case AMDGPU::FLAT_LOAD_DWORDX2:
475 case AMDGPU::FLAT_LOAD_DWORDX3:
476 case AMDGPU::FLAT_LOAD_DWORDX4:
477 return FLAT_LOAD;
478 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
479 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
480 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
481 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
482 return GLOBAL_LOAD_SADDR;
483 case AMDGPU::GLOBAL_STORE_DWORD:
484 case AMDGPU::GLOBAL_STORE_DWORDX2:
485 case AMDGPU::GLOBAL_STORE_DWORDX3:
486 case AMDGPU::GLOBAL_STORE_DWORDX4:
487 case AMDGPU::FLAT_STORE_DWORD:
488 case AMDGPU::FLAT_STORE_DWORDX2:
489 case AMDGPU::FLAT_STORE_DWORDX3:
490 case AMDGPU::FLAT_STORE_DWORDX4:
491 return FLAT_STORE;
492 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
493 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
494 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
495 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
496 return GLOBAL_STORE_SADDR;
497 }
498}
499
500/// Determines instruction subclass from opcode. Only instructions
501/// of the same subclass can be merged together. The merged instruction may have
502/// a different subclass but must have the same class.
503static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
504 switch (Opc) {
505 default:
506 if (TII.isMUBUF(Opc))
507 return AMDGPU::getMUBUFBaseOpcode(Opc);
508 if (TII.isMIMG(Opc)) {
509 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
510 assert(Info);
511 return Info->BaseOpcode;
512 }
513 if (TII.isMTBUF(Opc))
514 return AMDGPU::getMTBUFBaseOpcode(Opc);
515 return -1;
516 case AMDGPU::DS_READ_B32:
517 case AMDGPU::DS_READ_B32_gfx9:
518 case AMDGPU::DS_READ_B64:
519 case AMDGPU::DS_READ_B64_gfx9:
520 case AMDGPU::DS_WRITE_B32:
521 case AMDGPU::DS_WRITE_B32_gfx9:
522 case AMDGPU::DS_WRITE_B64:
523 case AMDGPU::DS_WRITE_B64_gfx9:
524 return Opc;
525 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
526 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
527 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
528 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
529 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
530 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
533 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
534 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
535 case AMDGPU::S_LOAD_DWORD_IMM:
536 case AMDGPU::S_LOAD_DWORDX2_IMM:
537 case AMDGPU::S_LOAD_DWORDX4_IMM:
538 case AMDGPU::S_LOAD_DWORDX8_IMM:
539 return AMDGPU::S_LOAD_DWORD_IMM;
540 case AMDGPU::GLOBAL_LOAD_DWORD:
541 case AMDGPU::GLOBAL_LOAD_DWORDX2:
542 case AMDGPU::GLOBAL_LOAD_DWORDX3:
543 case AMDGPU::GLOBAL_LOAD_DWORDX4:
544 case AMDGPU::FLAT_LOAD_DWORD:
545 case AMDGPU::FLAT_LOAD_DWORDX2:
546 case AMDGPU::FLAT_LOAD_DWORDX3:
547 case AMDGPU::FLAT_LOAD_DWORDX4:
548 return AMDGPU::FLAT_LOAD_DWORD;
549 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
550 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
551 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
552 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
553 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
554 case AMDGPU::GLOBAL_STORE_DWORD:
555 case AMDGPU::GLOBAL_STORE_DWORDX2:
556 case AMDGPU::GLOBAL_STORE_DWORDX3:
557 case AMDGPU::GLOBAL_STORE_DWORDX4:
558 case AMDGPU::FLAT_STORE_DWORD:
559 case AMDGPU::FLAT_STORE_DWORDX2:
560 case AMDGPU::FLAT_STORE_DWORDX3:
561 case AMDGPU::FLAT_STORE_DWORDX4:
562 return AMDGPU::FLAT_STORE_DWORD;
563 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
564 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
565 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
566 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
567 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
568 }
569}
570
571// GLOBAL loads and stores are classified as FLAT initially. If both combined
572// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
573// If either or both instructions are non-segment-specific FLAT, the resulting
574// combined operation will be FLAT, potentially promoting one of the GLOBAL
575// operations to FLAT.
576// For other instructions, return the original class unmodified.
577InstClassEnum
578SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
579 const CombineInfo &Paired) {
580 assert(CI.InstClass == Paired.InstClass);
581
582 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
583 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
584 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
585
586 return CI.InstClass;
587}
588
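// Determine which address operands (vaddr, srsrc, soffset, sbase, saddr,
// addr, ssamp) are present for the given opcode.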
589static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
590 AddressRegs Result;
591
592 if (TII.isMUBUF(Opc)) {
593 if (AMDGPU::getMUBUFHasVAddr(Opc))
594 Result.VAddr = true;
595 if (AMDGPU::getMUBUFHasSrsrc(Opc))
596 Result.SRsrc = true;
597 if (AMDGPU::getMUBUFHasSoffset(Opc))
598 Result.SOffset = true;
599
600 return Result;
601 }
602
603 if (TII.isMIMG(Opc)) {
604 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
605 if (VAddr0Idx >= 0) {
606 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
607 Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
608 } else {
609 Result.VAddr = true;
610 }
611 Result.SRsrc = true;
612 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
613 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
614 Result.SSamp = true;
615
616 return Result;
617 }
618 if (TII.isMTBUF(Opc)) {
619 if (AMDGPU::getMTBUFHasVAddr(Opc))
620 Result.VAddr = true;
621 if (AMDGPU::getMTBUFHasSrsrc(Opc))
622 Result.SRsrc = true;
623 if (AMDGPU::getMTBUFHasSoffset(Opc))
624 Result.SOffset = true;
625
626 return Result;
627 }
628
629 switch (Opc) {
630 default:
631 return Result;
632 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
633 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
634 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
635 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
636 Result.SOffset = true;
637 [[fallthrough]];
638 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
639 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
642 case AMDGPU::S_LOAD_DWORD_IMM:
643 case AMDGPU::S_LOAD_DWORDX2_IMM:
644 case AMDGPU::S_LOAD_DWORDX4_IMM:
645 case AMDGPU::S_LOAD_DWORDX8_IMM:
646 Result.SBase = true;
647 return Result;
648 case AMDGPU::DS_READ_B32:
649 case AMDGPU::DS_READ_B64:
650 case AMDGPU::DS_READ_B32_gfx9:
651 case AMDGPU::DS_READ_B64_gfx9:
652 case AMDGPU::DS_WRITE_B32:
653 case AMDGPU::DS_WRITE_B64:
654 case AMDGPU::DS_WRITE_B32_gfx9:
655 case AMDGPU::DS_WRITE_B64_gfx9:
656 Result.Addr = true;
657 return Result;
658 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
659 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
660 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
661 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
662 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
663 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
664 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
665 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
666 Result.SAddr = true;
667 [[fallthrough]];
668 case AMDGPU::GLOBAL_LOAD_DWORD:
669 case AMDGPU::GLOBAL_LOAD_DWORDX2:
670 case AMDGPU::GLOBAL_LOAD_DWORDX3:
671 case AMDGPU::GLOBAL_LOAD_DWORDX4:
672 case AMDGPU::GLOBAL_STORE_DWORD:
673 case AMDGPU::GLOBAL_STORE_DWORDX2:
674 case AMDGPU::GLOBAL_STORE_DWORDX3:
675 case AMDGPU::GLOBAL_STORE_DWORDX4:
676 case AMDGPU::FLAT_LOAD_DWORD:
677 case AMDGPU::FLAT_LOAD_DWORDX2:
678 case AMDGPU::FLAT_LOAD_DWORDX3:
679 case AMDGPU::FLAT_LOAD_DWORDX4:
680 case AMDGPU::FLAT_STORE_DWORD:
681 case AMDGPU::FLAT_STORE_DWORDX2:
682 case AMDGPU::FLAT_STORE_DWORDX3:
683 case AMDGPU::FLAT_STORE_DWORDX4:
684 Result.VAddr = true;
685 return Result;
686 }
687}
688
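// Initialize this CombineInfo from \p MI: record its instruction class,
// element size, offset (or dmask for MIMG), width, and the indices of all
// address operands.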
689void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
690 const SILoadStoreOptimizer &LSO) {
691 I = MI;
692 unsigned Opc = MI->getOpcode();
693 InstClass = getInstClass(Opc, *LSO.TII);
694
695 if (InstClass == UNKNOWN)
696 return;
697
698 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
699
700 switch (InstClass) {
701 case DS_READ:
702 EltSize =
703 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
704 : 4;
705 break;
706 case DS_WRITE:
707 EltSize =
708 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
709 : 4;
710 break;
711 case S_BUFFER_LOAD_IMM:
712 case S_BUFFER_LOAD_SGPR_IMM:
713 case S_LOAD_IMM:
714 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
715 break;
716 default:
717 EltSize = 4;
718 break;
719 }
720
721 if (InstClass == MIMG) {
722 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
723 // Offset is not considered for MIMG instructions.
724 Offset = 0;
725 } else {
726 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
727 Offset = I->getOperand(OffsetIdx).getImm();
728 }
729
730 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
731 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
732
733 Width = getOpcodeWidth(*I, *LSO.TII);
734
735 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
736 Offset &= 0xffff;
737 } else if (InstClass != MIMG) {
738 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
739 }
740
741 AddressRegs Regs = getRegs(Opc, *LSO.TII);
742
743 NumAddresses = 0;
744 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
745 AddrIdx[NumAddresses++] =
746 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
747 if (Regs.Addr)
748 AddrIdx[NumAddresses++] =
749 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
750 if (Regs.SBase)
751 AddrIdx[NumAddresses++] =
752 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
753 if (Regs.SRsrc)
754 AddrIdx[NumAddresses++] =
755 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
756 if (Regs.SOffset)
757 AddrIdx[NumAddresses++] =
758 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
759 if (Regs.SAddr)
760 AddrIdx[NumAddresses++] =
761 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
762 if (Regs.VAddr)
763 AddrIdx[NumAddresses++] =
764 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
765 if (Regs.SSamp)
766 AddrIdx[NumAddresses++] =
767 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
768 assert(NumAddresses <= MaxAddressRegs);
769
770 for (unsigned J = 0; J < NumAddresses; J++)
771 AddrReg[J] = &I->getOperand(AddrIdx[J]);
772}
773
774} // end anonymous namespace.
775
776INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
777 "SI Load Store Optimizer", false, false)
778INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
779INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
780 false, false)
781
782char SILoadStoreOptimizer::ID = 0;
783
784char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
785
786FunctionPass *llvm::createSILoadStoreOptimizerPass() {
787 return new SILoadStoreOptimizer();
788}
789
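// Collect every register defined and every register read by \p MI into
// \p RegDefs and \p RegUses respectively.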
790static void addDefsUsesToList(const MachineInstr &MI,
791 DenseSet<Register> &RegDefs,
792 DenseSet<Register> &RegUses) {
793 for (const auto &Op : MI.operands()) {
794 if (!Op.isReg())
795 continue;
796 if (Op.isDef())
797 RegDefs.insert(Op.getReg());
798 if (Op.readsReg())
799 RegUses.insert(Op.getReg());
800 }
801}
802
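// Return true if \p A and \p B can safely exchange positions: they must not be
// aliasing memory accesses when one of them stores, and \p B must not define
// or read any register defined by \p A, nor define any register \p A reads.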
803bool SILoadStoreOptimizer::canSwapInstructions(
804 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
805 const MachineInstr &A, const MachineInstr &B) const {
806 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
807 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
808 return false;
809 for (const auto &BOp : B.operands()) {
810 if (!BOp.isReg())
811 continue;
812 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
813 return false;
814 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
815 return false;
816 }
817 return true;
818}
819
820// Given that \p CI and \p Paired are adjacent memory operations, produce a new
821// MMO for the combined operation with a new access size.
822MachineMemOperand *
823SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
824 const CombineInfo &Paired) {
825 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
826 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
827
828 unsigned Size = MMOa->getSize() + MMOb->getSize();
829
830 // A base pointer for the combined operation is the same as the leading
831 // operation's pointer.
832 if (Paired < CI)
833 std::swap(MMOa, MMOb);
834
835 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
836 // If merging FLAT and GLOBAL set address space to FLAT.
837 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
838 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
839
840 MachineFunction *MF = CI.I->getMF();
841 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
842}
843
844bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
845 const SIInstrInfo &TII,
846 const CombineInfo &Paired) {
847 assert(CI.InstClass == MIMG);
848
849 // Ignore instructions with tfe/lwe set.
850 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
851 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
852
853 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
854 return false;
855
856 // Check other optional immediate operands for equality.
857 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
858 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
859 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
860
861 for (auto op : OperandsToMatch) {
862 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
863 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
864 return false;
865 if (Idx != -1 &&
866 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
867 return false;
868 }
869
870 // Check DMask for overlaps.
871 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
872 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
873
874 if (!MaxMask)
875 return false;
876
877 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
878 if ((1u << AllowedBitsForMin) <= MinMask)
879 return false;
880
881 return true;
882}
883
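// Return a buffer format with the same bits-per-component and numeric format
// as \p OldFormat but with \p ComponentCount components, or 0 if no such
// format exists.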
884static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
885 unsigned ComponentCount,
886 const GCNSubtarget &STI) {
887 if (ComponentCount > 4)
888 return 0;
889
890 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
891 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
892 if (!OldFormatInfo)
893 return 0;
894
895 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
896 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
897 ComponentCount,
898 OldFormatInfo->NumFormat, STI);
899
900 if (!NewFormatInfo)
901 return 0;
902
903 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
904 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
905
906 return NewFormatInfo->Format;
907}
908
909// Return the value in the inclusive range [Lo,Hi] that is aligned to the
910// highest power of two. Note that the result is well defined for all inputs
911// including corner cases like:
912// - if Lo == Hi, return that value
913// - if Lo == 0, return 0 (even though the "- 1" below underflows)
914// - if Lo > Hi, return 0 (as if the range wrapped around)
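// For example, mostAlignedValueInRange(5, 10) == 8 and
// mostAlignedValueInRange(17, 31) == 24.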
915static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
916 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
917}
918
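// Return true if the offsets of \p CI and \p Paired can be combined into a
// single merged access; when \p Modify is true, also rewrite the offsets (and,
// for DS, BaseOff/UseST64) to the values the merged instruction will use.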
919bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
920 const GCNSubtarget &STI,
921 CombineInfo &Paired,
922 bool Modify) {
923 assert(CI.InstClass != MIMG);
924
925 // XXX - Would the same offset be OK? Is there any reason this would happen or
926 // be useful?
927 if (CI.Offset == Paired.Offset)
928 return false;
929
930 // This won't be valid if the offset isn't aligned.
931 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
932 return false;
933
934 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
935
936 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
937 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
938 if (!Info0)
939 return false;
940 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
941 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
942 if (!Info1)
943 return false;
944
945 if (Info0->BitsPerComp != Info1->BitsPerComp ||
946 Info0->NumFormat != Info1->NumFormat)
947 return false;
948
949 // TODO: Should be possible to support more formats, but if format loads
950 // are not dword-aligned, the merged load might not be valid.
951 if (Info0->BitsPerComp != 32)
952 return false;
953
954 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
955 return false;
956 }
957
958 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
959 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
960 CI.UseST64 = false;
961 CI.BaseOff = 0;
962
963 // Handle all non-DS instructions.
964 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
965 if (EltOffset0 + CI.Width != EltOffset1 &&
966 EltOffset1 + Paired.Width != EltOffset0)
967 return false;
968 if (CI.CPol != Paired.CPol)
969 return false;
970 return true;
971 }
972
973 // If the offset in elements doesn't fit in 8-bits, we might be able to use
974 // the stride 64 versions.
975 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
976 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
977 if (Modify) {
978 CI.Offset = EltOffset0 / 64;
979 Paired.Offset = EltOffset1 / 64;
980 CI.UseST64 = true;
981 }
982 return true;
983 }
984
985 // Check if the new offsets fit in the reduced 8-bit range.
986 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
987 if (Modify) {
988 CI.Offset = EltOffset0;
989 Paired.Offset = EltOffset1;
990 }
991 return true;
992 }
993
994 // Try to shift base address to decrease offsets.
995 uint32_t Min = std::min(EltOffset0, EltOffset1);
996 uint32_t Max = std::max(EltOffset0, EltOffset1);
997
998 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
999 if (((Max - Min) & ~Mask) == 0) {
1000 if (Modify) {
1001 // From the range of values we could use for BaseOff, choose the one that
1002 // is aligned to the highest power of two, to maximise the chance that
1003 // the same offset can be reused for other load/store pairs.
1004 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1005 // Copy the low bits of the offsets, so that when we adjust them by
1006 // subtracting BaseOff they will be multiples of 64.
1007 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1008 CI.BaseOff = BaseOff * CI.EltSize;
1009 CI.Offset = (EltOffset0 - BaseOff) / 64;
1010 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1011 CI.UseST64 = true;
1012 }
1013 return true;
1014 }
1015
1016 if (isUInt<8>(Max - Min)) {
1017 if (Modify) {
1018 // From the range of values we could use for BaseOff, choose the one that
1019 // is aligned to the highest power of two, to maximise the chance that
1020 // the same offset can be reused for other load/store pairs.
1021 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1022 CI.BaseOff = BaseOff * CI.EltSize;
1023 CI.Offset = EltOffset0 - BaseOff;
1024 Paired.Offset = EltOffset1 - BaseOff;
1025 }
1026 return true;
1027 }
1028
1029 return false;
1030}
1031
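// Return true if the combined width of \p CI and \p Paired is encodable for
// this instruction class on the given subtarget.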
1032bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1033 const CombineInfo &CI,
1034 const CombineInfo &Paired) {
1035 const unsigned Width = (CI.Width + Paired.Width);
1036 switch (CI.InstClass) {
1037 default:
1038 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1039 case S_BUFFER_LOAD_IMM:
1040 case S_BUFFER_LOAD_SGPR_IMM:
1041 case S_LOAD_IMM:
1042 switch (Width) {
1043 default:
1044 return false;
1045 case 2:
1046 case 4:
1047 case 8:
1048 return true;
1049 }
1050 }
1051}
1052
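// Return the register class of the data operand (destination for loads,
// source for stores) of \p MI, or nullptr if none is found.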
1053const TargetRegisterClass *
1054SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1055 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1056 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1057 }
1058 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1059 return TRI->getRegClassForReg(*MRI, Src->getReg());
1060 }
1061 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1062 return TRI->getRegClassForReg(*MRI, Src->getReg());
1063 }
1064 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1065 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1066 }
1067 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1068 return TRI->getRegClassForReg(*MRI, Src->getReg());
1069 }
1070 return nullptr;
1071}
1072
1073/// This function assumes that CI comes before Paired in a basic block. Return
1074/// an insertion point for the merged instruction or nullptr on failure.
1075SILoadStoreOptimizer::CombineInfo *
1076SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1077 CombineInfo &Paired) {
1078 // If another instruction has already been merged into CI, it may now be a
1079 // type that we can't do any further merging into.
1080 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1081 return nullptr;
1082 assert(CI.InstClass == Paired.InstClass);
1083
1084 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1085 getInstSubclass(Paired.I->getOpcode(), *TII))
1086 return nullptr;
1087
1088 // Check both offsets (or masks for MIMG) can be combined and fit in the
1089 // reduced range.
1090 if (CI.InstClass == MIMG) {
1091 if (!dmasksCanBeCombined(CI, *TII, Paired))
1092 return nullptr;
1093 } else {
1094 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1095 return nullptr;
1096 }
1097
1098 DenseSet<Register> RegDefs;
1099 DenseSet<Register> RegUses;
1100 CombineInfo *Where;
1101 if (CI.I->mayLoad()) {
1102 // Try to hoist Paired up to CI.
1103 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1104 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1105 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1106 return nullptr;
1107 }
1108 Where = &CI;
1109 } else {
1110 // Try to sink CI down to Paired.
1111 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1112 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1113 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1114 return nullptr;
1115 }
1116 Where = &Paired;
1117 }
1118
1119 // Call offsetsCanBeCombined with modify = true so that the offsets are
1120 // correct for the new instruction. This should return true, because
1121 // this function should only be called on CombineInfo objects that
1122 // have already been confirmed to be mergeable.
1123 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1124 offsetsCanBeCombined(CI, *STM, Paired, true);
1125 return Where;
1126}
1127
1128unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1129 if (STM->ldsRequiresM0Init())
1130 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1131 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1132}
1133
1134unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1135 if (STM->ldsRequiresM0Init())
1136 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1137
1138 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1139 : AMDGPU::DS_READ2ST64_B64_gfx9;
1140}
1141
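// Merge \p CI and \p Paired into a single ds_read2/ds_read2st64 and copy the
// result sub-registers back to the original destination registers.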
1142MachineBasicBlock::iterator
1143SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1144 MachineBasicBlock::iterator InsertBefore) {
1145 MachineBasicBlock *MBB = CI.I->getParent();
1146
1147 // Be careful, since the addresses could be subregisters themselves in weird
1148 // cases, like vectors of pointers.
1149 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1150
1151 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1152 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1153
1154 unsigned NewOffset0 = CI.Offset;
1155 unsigned NewOffset1 = Paired.Offset;
1156 unsigned Opc =
1157 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1158
1159 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1160 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1161
1162 if (NewOffset0 > NewOffset1) {
1163 // Canonicalize the merged instruction so the smaller offset comes first.
1164 std::swap(NewOffset0, NewOffset1);
1165 std::swap(SubRegIdx0, SubRegIdx1);
1166 }
1167
1168 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1169 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1170
1171 const MCInstrDesc &Read2Desc = TII->get(Opc);
1172
1173 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1174 Register DestReg = MRI->createVirtualRegister(SuperRC);
1175
1176 DebugLoc DL = CI.I->getDebugLoc();
1177
1178 Register BaseReg = AddrReg->getReg();
1179 unsigned BaseSubReg = AddrReg->getSubReg();
1180 unsigned BaseRegFlags = 0;
1181 if (CI.BaseOff) {
1182 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1183 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1184 .addImm(CI.BaseOff);
1185
1186 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1187 BaseRegFlags = RegState::Kill;
1188
1189 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1190 .addReg(ImmReg)
1191 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1192 .addImm(0); // clamp bit
1193 BaseSubReg = 0;
1194 }
1195
1196 MachineInstrBuilder Read2 =
1197 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1198 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1199 .addImm(NewOffset0) // offset0
1200 .addImm(NewOffset1) // offset1
1201 .addImm(0) // gds
1202 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1203
1204 (void)Read2;
1205
1206 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1207
1208 // Copy to the old destination registers.
1209 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1210 .add(*Dest0) // Copy to same destination including flags and sub reg.
1211 .addReg(DestReg, 0, SubRegIdx0);
1212 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1213 .add(*Dest1)
1214 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1215
1216 CI.I->eraseFromParent();
1217 Paired.I->eraseFromParent();
1218
1219 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1220 return Read2;
1221}
1222
1223unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1224 if (STM->ldsRequiresM0Init())
1225 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1226 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1227 : AMDGPU::DS_WRITE2_B64_gfx9;
1228}
1229
1230unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1231 if (STM->ldsRequiresM0Init())
1232 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1233 : AMDGPU::DS_WRITE2ST64_B64;
1234
1235 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1236 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1237}
1238
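// Merge \p CI and \p Paired into a single ds_write2/ds_write2st64 instruction.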
1239MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1240 CombineInfo &CI, CombineInfo &Paired,
1241 MachineBasicBlock::iterator InsertBefore) {
1242 MachineBasicBlock *MBB = CI.I->getParent();
1243
1244 // Be sure to use .add() rather than .addReg() with these, so that we
1245 // preserve the subregister index and any register flags set on them.
1246 const MachineOperand *AddrReg =
1247 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1248 const MachineOperand *Data0 =
1249 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1250 const MachineOperand *Data1 =
1251 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1252
1253 unsigned NewOffset0 = CI.Offset;
1254 unsigned NewOffset1 = Paired.Offset;
1255 unsigned Opc =
1256 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1257
1258 if (NewOffset0 > NewOffset1) {
1259 // Canonicalize the merged instruction so the smaller offset comes first.
1260 std::swap(NewOffset0, NewOffset1);
1261 std::swap(Data0, Data1);
1262 }
1263
1264 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1265 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1266
1267 const MCInstrDesc &Write2Desc = TII->get(Opc);
1268 DebugLoc DL = CI.I->getDebugLoc();
1269
1270 Register BaseReg = AddrReg->getReg();
1271 unsigned BaseSubReg = AddrReg->getSubReg();
1272 unsigned BaseRegFlags = 0;
1273 if (CI.BaseOff) {
1274 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1275 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1276 .addImm(CI.BaseOff);
1277
1278 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1279 BaseRegFlags = RegState::Kill;
1280
1281 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1282 .addReg(ImmReg)
1283 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1284 .addImm(0); // clamp bit
1285 BaseSubReg = 0;
1286 }
1287
1288 MachineInstrBuilder Write2 =
1289 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1290 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1291 .add(*Data0) // data0
1292 .add(*Data1) // data1
1293 .addImm(NewOffset0) // offset0
1294 .addImm(NewOffset1) // offset1
1295 .addImm(0) // gds
1296 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1297
1298 CI.I->eraseFromParent();
1299 Paired.I->eraseFromParent();
1300
1301 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1302 return Write2;
1303}
1304
1305MachineBasicBlock::iterator
1306SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1307 MachineBasicBlock::iterator InsertBefore) {
1308 MachineBasicBlock *MBB = CI.I->getParent();
1309 DebugLoc DL = CI.I->getDebugLoc();
1310 const unsigned Opcode = getNewOpcode(CI, Paired);
1311
1312 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1313
1314 Register DestReg = MRI->createVirtualRegister(SuperRC);
1315 unsigned MergedDMask = CI.DMask | Paired.DMask;
1316 unsigned DMaskIdx =
1317 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1318
1319 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1320 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1321 if (I == DMaskIdx)
1322 MIB.addImm(MergedDMask);
1323 else
1324 MIB.add((*CI.I).getOperand(I));
1325 }
1326
1327 // It shouldn't be possible to get this far if the two instructions
1328 // don't have a single memoperand, because MachineInstr::mayAlias()
1329 // will return true if this is the case.
1330 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1331
1332 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1333
1334 unsigned SubRegIdx0, SubRegIdx1;
1335 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1336
1337 // Copy to the old destination registers.
1338 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1339 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1340 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1341
1342 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1343 .add(*Dest0) // Copy to same destination including flags and sub reg.
1344 .addReg(DestReg, 0, SubRegIdx0);
1345 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1346 .add(*Dest1)
1347 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1348
1349 CI.I->eraseFromParent();
1350 Paired.I->eraseFromParent();
1351 return New;
1352}
1353
1354MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1355 CombineInfo &CI, CombineInfo &Paired,
1356 MachineBasicBlock::iterator InsertBefore) {
1357 MachineBasicBlock *MBB = CI.I->getParent();
1358 DebugLoc DL = CI.I->getDebugLoc();
1359 const unsigned Opcode = getNewOpcode(CI, Paired);
1360
1361 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1362
1363 Register DestReg = MRI->createVirtualRegister(SuperRC);
1364 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1365
1366 // It shouldn't be possible to get this far if the two instructions
1367 // don't have a single memoperand, because MachineInstr::mayAlias()
1368 // will return true if this is the case.
1369 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1370
1370
1371 MachineInstrBuilder New =
1372 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1373 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1374 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1375 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1376 New.addImm(MergedOffset);
1377 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1378
1379 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1380 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1381 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1382
1383 // Copy to the old destination registers.
1384 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1385 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1386 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1387
1388 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1389 .add(*Dest0) // Copy to same destination including flags and sub reg.
1390 .addReg(DestReg, 0, SubRegIdx0);
1391 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1392 .add(*Dest1)
1393 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1394
1395 CI.I->eraseFromParent();
1396 Paired.I->eraseFromParent();
1397 return New;
1398}
1399
1400MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1401 CombineInfo &CI, CombineInfo &Paired,
1402 MachineBasicBlock::iterator InsertBefore) {
1403 MachineBasicBlock *MBB = CI.I->getParent();
1404 DebugLoc DL = CI.I->getDebugLoc();
1405
1406 const unsigned Opcode = getNewOpcode(CI, Paired);
1407
1408 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1409
1410 // Create the new destination register.
1411 Register DestReg = MRI->createVirtualRegister(SuperRC);
1412 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1413
1414 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1415
1416 AddressRegs Regs = getRegs(Opcode, *TII);
1417
1418 if (Regs.VAddr)
1419 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1420
1421 // It shouldn't be possible to get this far if the two instructions
1422 // don't have a single memoperand, because MachineInstr::mayAlias()
1423 // will return true if this is the case.
1424 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1425
1426 MachineInstr *New =
1427 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1428 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1429 .addImm(MergedOffset) // offset
1430 .addImm(CI.CPol) // cpol
1431 .addImm(0) // swz
1432 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1433
1434 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1435 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1436 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1437
1438 // Copy to the old destination registers.
1439 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1440 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1441 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1442
1443 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1444 .add(*Dest0) // Copy to same destination including flags and sub reg.
1445 .addReg(DestReg, 0, SubRegIdx0);
1446 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1447 .add(*Dest1)
1448 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1449
1450 CI.I->eraseFromParent();
1451 Paired.I->eraseFromParent();
1452 return New;
1453}
1454
1455MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1456 CombineInfo &CI, CombineInfo &Paired,
1457 MachineBasicBlock::iterator InsertBefore) {
1458 MachineBasicBlock *MBB = CI.I->getParent();
1459 DebugLoc DL = CI.I->getDebugLoc();
1460
1461 const unsigned Opcode = getNewOpcode(CI, Paired);
1462
1463 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1464
1465 // Create the new destination register.
1466 Register DestReg = MRI->createVirtualRegister(SuperRC);
1467 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1468
1469 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1470
1471 AddressRegs Regs = getRegs(Opcode, *TII);
1472
1473 if (Regs.VAddr)
1474 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1475
1476 unsigned JoinedFormat =
1477 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1478
1479 // It shouldn't be possible to get this far if the two instructions
1480 // don't have a single memoperand, because MachineInstr::mayAlias()
1481 // will return true if this is the case.
1482 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1483
1484 MachineInstr *New =
1485 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1486 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1487 .addImm(MergedOffset) // offset
1488 .addImm(JoinedFormat) // format
1489 .addImm(CI.CPol) // cpol
1490 .addImm(0) // swz
1491 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1492
1493 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1494 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1495 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1496
1497 // Copy to the old destination registers.
1498 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1499 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1500 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1501
1502 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1503 .add(*Dest0) // Copy to same destination including flags and sub reg.
1504 .addReg(DestReg, 0, SubRegIdx0);
1505 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1506 .add(*Dest1)
1507 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1508
1509 CI.I->eraseFromParent();
1510 Paired.I->eraseFromParent();
1511 return New;
1512}
1513
1514MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1515 CombineInfo &CI, CombineInfo &Paired,
1516 MachineBasicBlock::iterator InsertBefore) {
1517 MachineBasicBlock *MBB = CI.I->getParent();
1518 DebugLoc DL = CI.I->getDebugLoc();
1519
1520 const unsigned Opcode = getNewOpcode(CI, Paired);
1521
1522 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1523 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1524 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1525
1526 // Copy to the new source register.
1527 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1528 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1529
1530 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1531 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1532
1533 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1534 .add(*Src0)
1535 .addImm(SubRegIdx0)
1536 .add(*Src1)
1537 .addImm(SubRegIdx1);
1538
1539 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1540 .addReg(SrcReg, RegState::Kill);
1541
1542 AddressRegs Regs = getRegs(Opcode, *TII);
1543
1544 if (Regs.VAddr)
1545 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1546
1547 unsigned JoinedFormat =
1548 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1549
1550 // It shouldn't be possible to get this far if the two instructions
1551 // don't have a single memoperand, because MachineInstr::mayAlias()
1552 // will return true if this is the case.
1553 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1554
1555 MachineInstr *New =
1556 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1557 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1558 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1559 .addImm(JoinedFormat) // format
1560 .addImm(CI.CPol) // cpol
1561 .addImm(0) // swz
1562 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1563
1564 CI.I->eraseFromParent();
1565 Paired.I->eraseFromParent();
1566 return New;
1567}
1568
1569MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1570 CombineInfo &CI, CombineInfo &Paired,
1571 MachineBasicBlock::iterator InsertBefore) {
1572 MachineBasicBlock *MBB = CI.I->getParent();
1573 DebugLoc DL = CI.I->getDebugLoc();
1574
1575 const unsigned Opcode = getNewOpcode(CI, Paired);
1576
1577 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1578 Register DestReg = MRI->createVirtualRegister(SuperRC);
1579
1580 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1581
1582 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1583 MIB.add(*SAddr);
1584
1585 MachineInstr *New =
1586 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1587 .addImm(std::min(CI.Offset, Paired.Offset))
1588 .addImm(CI.CPol)
1589 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1590
1591 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1592 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1593 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1594
1595 // Copy to the old destination registers.
1596 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1597 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1598 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1599
1600 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1601 .add(*Dest0) // Copy to same destination including flags and sub reg.
1602 .addReg(DestReg, 0, SubRegIdx0);
1603 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1604 .add(*Dest1)
1605 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1606
1607 CI.I->eraseFromParent();
1608 Paired.I->eraseFromParent();
1609 return New;
1610}
1611
1612MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1613 CombineInfo &CI, CombineInfo &Paired,
1614 MachineBasicBlock::iterator InsertBefore) {
1615 MachineBasicBlock *MBB = CI.I->getParent();
1616 DebugLoc DL = CI.I->getDebugLoc();
1617
1618 const unsigned Opcode = getNewOpcode(CI, Paired);
1619
1620 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1621 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1622 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1623
1624 // Copy to the new source register.
1625 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1626 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1627
1628 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1629 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1630
1631 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1632 .add(*Src0)
1633 .addImm(SubRegIdx0)
1634 .add(*Src1)
1635 .addImm(SubRegIdx1);
1636
1637 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1638 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1639 .addReg(SrcReg, RegState::Kill);
1640
1641 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1642 MIB.add(*SAddr);
1643
1644 MachineInstr *New =
1645 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1646 .addImm(CI.CPol)
1647 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1648
1649 CI.I->eraseFromParent();
1650 Paired.I->eraseFromParent();
1651 return New;
1652}
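
// Illustrative sketch (hypothetical operands): the store path mirrors the load
// path above. The two vdata operands are first glued into a super-register,
// roughly
//   %w:vreg_64 = REG_SEQUENCE %d0, %subreg.sub0, %d1, %subreg.sub1
//   GLOBAL_STORE_DWORDX2_SADDR %vaddr, %w, %sbase, 0, 0
// so the merged store reads a single killed source register instead of two
// separate dword data operands.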
1653
1654unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1655 const CombineInfo &Paired) {
1656 const unsigned Width = CI.Width + Paired.Width;
1657
1658 switch (getCommonInstClass(CI, Paired)) {
1659 default:
1660 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1661 // FIXME: Handle d16 correctly
1662 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1663 Width);
1664 case TBUFFER_LOAD:
1665 case TBUFFER_STORE:
1666 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1667 Width);
1668
1669 case UNKNOWN:
1670 llvm_unreachable("Unknown instruction class");
1671 case S_BUFFER_LOAD_IMM:
1672 switch (Width) {
1673 default:
1674 return 0;
1675 case 2:
1676 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1677 case 4:
1678 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1679 case 8:
1680 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1681 }
1682 case S_BUFFER_LOAD_SGPR_IMM:
1683 switch (Width) {
1684 default:
1685 return 0;
1686 case 2:
1687 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1688 case 4:
1689 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1690 case 8:
1691 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1692 }
1693 case S_LOAD_IMM:
1694 switch (Width) {
1695 default:
1696 return 0;
1697 case 2:
1698 return AMDGPU::S_LOAD_DWORDX2_IMM;
1699 case 4:
1700 return AMDGPU::S_LOAD_DWORDX4_IMM;
1701 case 8:
1702 return AMDGPU::S_LOAD_DWORDX8_IMM;
1703 }
1704 case GLOBAL_LOAD:
1705 switch (Width) {
1706 default:
1707 return 0;
1708 case 2:
1709 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1710 case 3:
1711 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1712 case 4:
1713 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1714 }
1715 case GLOBAL_LOAD_SADDR:
1716 switch (Width) {
1717 default:
1718 return 0;
1719 case 2:
1720 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1721 case 3:
1722 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1723 case 4:
1724 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1725 }
1726 case GLOBAL_STORE:
1727 switch (Width) {
1728 default:
1729 return 0;
1730 case 2:
1731 return AMDGPU::GLOBAL_STORE_DWORDX2;
1732 case 3:
1733 return AMDGPU::GLOBAL_STORE_DWORDX3;
1734 case 4:
1735 return AMDGPU::GLOBAL_STORE_DWORDX4;
1736 }
1737 case GLOBAL_STORE_SADDR:
1738 switch (Width) {
1739 default:
1740 return 0;
1741 case 2:
1742 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1743 case 3:
1744 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1745 case 4:
1746 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1747 }
1748 case FLAT_LOAD:
1749 switch (Width) {
1750 default:
1751 return 0;
1752 case 2:
1753 return AMDGPU::FLAT_LOAD_DWORDX2;
1754 case 3:
1755 return AMDGPU::FLAT_LOAD_DWORDX3;
1756 case 4:
1757 return AMDGPU::FLAT_LOAD_DWORDX4;
1758 }
1759 case FLAT_STORE:
1760 switch (Width) {
1761 default:
1762 return 0;
1763 case 2:
1764 return AMDGPU::FLAT_STORE_DWORDX2;
1765 case 3:
1766 return AMDGPU::FLAT_STORE_DWORDX3;
1767 case 4:
1768 return AMDGPU::FLAT_STORE_DWORDX4;
1769 }
1770 case MIMG:
1771 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1772 "No overlaps");
1773 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1774 }
1775}
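
// Worked example (illustrative): merging a GLOBAL_LOAD_DWORDX2_SADDR
// (Width 2) with a GLOBAL_LOAD_DWORD_SADDR (Width 1) gives Width == 3, so the
// GLOBAL_LOAD_SADDR case above returns AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR. A
// width a class has no entry for falls into that class's `default: return 0;`,
// signalling that no merged opcode exists for the pair.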
1776
1777std::pair<unsigned, unsigned>
1778SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1779 const CombineInfo &Paired) {
1780 assert((CI.InstClass != MIMG ||
1781 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1782 CI.Width + Paired.Width)) &&
1783 "No overlaps");
1784
1785 unsigned Idx0;
1786 unsigned Idx1;
1787
1788 static const unsigned Idxs[5][4] = {
1789 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1790 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1791 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1792 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1793 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1794 };
1795
1796 assert(CI.Width >= 1 && CI.Width <= 4);
1797 assert(Paired.Width >= 1 && Paired.Width <= 4);
1798
1799 if (Paired < CI) {
1800 Idx1 = Idxs[0][Paired.Width - 1];
1801 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1802 } else {
1803 Idx0 = Idxs[0][CI.Width - 1];
1804 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1805 }
1806
1807 return std::pair(Idx0, Idx1);
1808}
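
// Worked example (illustrative): with CI.Width == 1, Paired.Width == 2 and CI
// earlier in the merge order (so the else branch is taken),
//   Idx0 = Idxs[0][0] == AMDGPU::sub0
//   Idx1 = Idxs[1][1] == AMDGPU::sub1_sub2
// i.e. CI's value occupies dword 0 of the merged register and Paired's value
// occupies dwords 1-2. When Paired sorts before CI, the `if (Paired < CI)`
// branch mirrors the roles.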
1809
1810const TargetRegisterClass *
1811SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1812 const CombineInfo &Paired) {
1813 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1814 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1815 switch (CI.Width + Paired.Width) {
1816 default:
1817 return nullptr;
1818 case 2:
1819 return &AMDGPU::SReg_64_XEXECRegClass;
1820 case 4:
1821 return &AMDGPU::SGPR_128RegClass;
1822 case 8:
1823 return &AMDGPU::SGPR_256RegClass;
1824 case 16:
1825 return &AMDGPU::SGPR_512RegClass;
1826 }
1827 }
1828
1829 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1830 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1831 ? TRI->getAGPRClassForBitWidth(BitWidth)
1832 : TRI->getVGPRClassForBitWidth(BitWidth);
1833}
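
// For example (illustrative): two merged S_LOAD_DWORD_IMM loads (combined
// width 2) select SReg_64_XEXEC, while a VMEM pair with combined width 4 asks
// TRI for the 128-bit VGPR class, or the 128-bit AGPR class when the original
// data operands live in AGPRs.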
1834
1835MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1836 CombineInfo &CI, CombineInfo &Paired,
1837 MachineBasicBlock::iterator InsertBefore) {
1838 MachineBasicBlock *MBB = CI.I->getParent();
1839 DebugLoc DL = CI.I->getDebugLoc();
1840
1841 const unsigned Opcode = getNewOpcode(CI, Paired);
1842
1843 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1844 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1845 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1846
1847 // Copy to the new source register.
1848 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1849 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1850
1851 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1852 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1853
1854 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1855 .add(*Src0)
1856 .addImm(SubRegIdx0)
1857 .add(*Src1)
1858 .addImm(SubRegIdx1);
1859
1860 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1861 .addReg(SrcReg, RegState::Kill);
1862
1863 AddressRegs Regs = getRegs(Opcode, *TII);
1864
1865 if (Regs.VAddr)
1866 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1867
1868
1869 // It shouldn't be possible to get this far if the two instructions
1870 // don't have a single memoperand, because MachineInstr::mayAlias()
1871 // will return true if this is the case.
1872 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1873
1874 MachineInstr *New =
1875 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1876 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1877 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1878 .addImm(CI.CPol) // cpol
1879 .addImm(0) // swz
1880 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1881
1882 CI.I->eraseFromParent();
1883 Paired.I->eraseFromParent();
1884 return New;
1885}
1886
1887MachineOperand
1888SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1889 APInt V(32, Val, true);
1890 if (TII->isInlineConstant(V))
1891 return MachineOperand::CreateImm(Val);
1892
1893 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1894 MachineInstr *Mov =
1895 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1896 TII->get(AMDGPU::S_MOV_B32), Reg)
1897 .addImm(Val);
1898 (void)Mov;
1899 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1900 return MachineOperand::CreateReg(Reg, false);
1901}
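
// For example (illustrative): createRegOrImm(16, MI) can return a plain
// immediate operand because 16 is an inline constant, whereas a value such as
// 0x1800 is not, so it is materialized first, roughly
//   s_mov_b32 sN, 0x1800
// and the new SGPR is returned as a register operand instead.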
1902
1903// Compute base address using Addr and return the final register.
1904Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1905 const MemAddress &Addr) const {
1906 MachineBasicBlock *MBB = MI.getParent();
1907 MachineBasicBlock::iterator MBBI = MI.getIterator();
1908 DebugLoc DL = MI.getDebugLoc();
1909
1910 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1911 Addr.Base.LoSubReg) &&
1912 "Expected 32-bit Base-Register-Low!!");
1913
1914 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1915 Addr.Base.HiSubReg) &&
1916 "Expected 32-bit Base-Register-Hi!!");
1917
1918 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1919 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1920 MachineOperand OffsetHi =
1921 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1922
1923 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1924 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1925 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1926
1927 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1928 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1929 MachineInstr *LoHalf =
1930 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1931 .addReg(CarryReg, RegState::Define)
1932 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1933 .add(OffsetLo)
1934 .addImm(0); // clamp bit
1935 (void)LoHalf;
1936 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1937
1938 MachineInstr *HiHalf =
1939 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1940 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1941 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1942 .add(OffsetHi)
1943 .addReg(CarryReg, RegState::Kill)
1944 .addImm(0); // clamp bit
1945 (void)HiHalf;
1946 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1947
1948 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1949 MachineInstr *FullBase =
1950 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1951 .addReg(DestSub0)
1952 .addImm(AMDGPU::sub0)
1953 .addReg(DestSub1)
1954 .addImm(AMDGPU::sub1);
1955 (void)FullBase;
1956 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1957
1958 return FullDestReg;
1959}
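
// Sketch of the sequence computeBase emits (illustrative; the carry register
// class depends on the wavefront size):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %Base.LoReg, <offset lo>, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64   %Base.HiReg, <offset hi>, %carry, 0
//   %newbase:vreg_64    = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// which is the same shape that processBaseWithConstOffset parses below.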
1960
1961// Update base and offset with the NewBase and NewOffset in MI.
1962void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1963 Register NewBase,
1964 int32_t NewOffset) const {
1965 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1966 Base->setReg(NewBase);
1967 Base->setIsKill(false);
1968 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1969}
1970
1971std::optional<int32_t>
1972SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1973 if (Op.isImm())
1974 return Op.getImm();
1975
1976 if (!Op.isReg())
1977 return std::nullopt;
1978
1979 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1980 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1981 !Def->getOperand(1).isImm())
1982 return std::nullopt;
1983
1984 return Def->getOperand(1).getImm();
1985}
1986
1987// Analyze Base and extract:
1988// - 32-bit base registers, subregisters
1989// - 64-bit constant offset
1990// Expecting base computation as:
1991// %OFFSET0:sgpr_32 = S_MOV_B32 8000
1992// %LO:vgpr_32, %c:sreg_64_xexec =
1993// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1994// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1995// %Base:vreg_64 =
1996// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1997void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1998 MemAddress &Addr) const {
1999 if (!Base.isReg())
2000 return;
2001
2002 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2003 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2004 || Def->getNumOperands() != 5)
2005 return;
2006
2007 MachineOperand BaseLo = Def->getOperand(1);
2008 MachineOperand BaseHi = Def->getOperand(3);
2009 if (!BaseLo.isReg() || !BaseHi.isReg())
2010 return;
2011
2012 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2013 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2014
2015 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2016 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2017 return;
2018
2019 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2020 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2021
2022 auto Offset0P = extractConstOffset(*Src0);
2023 if (Offset0P)
2024 BaseLo = *Src1;
2025 else {
2026 if (!(Offset0P = extractConstOffset(*Src1)))
2027 return;
2028 BaseLo = *Src0;
2029 }
2030
2031 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2032 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2033
2034 if (Src0->isImm())
2035 std::swap(Src0, Src1);
2036
2037 if (!Src1->isImm())
2038 return;
2039
2040 uint64_t Offset1 = Src1->getImm();
2041 BaseHi = *Src0;
2042
2043 Addr.Base.LoReg = BaseLo.getReg();
2044 Addr.Base.HiReg = BaseHi.getReg();
2045 Addr.Base.LoSubReg = BaseLo.getSubReg();
2046 Addr.Base.HiSubReg = BaseHi.getSubReg();
2047 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2048}
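
// Worked example (illustrative, hypothetical register names): for a base
// built as
//   %c:sgpr_32 = S_MOV_B32 8192
//   %lo:vgpr_32, %cc = V_ADD_CO_U32_e64 %ptr_lo, %c, 0
//   %hi:vgpr_32      = V_ADDC_U32_e64   %ptr_hi, 0, %cc, 0
//   %base:vreg_64    = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// this routine records Addr.Base as {%ptr_lo, %ptr_hi} with their subregister
// indices and Addr.Offset as 8192 (low half 8192, high half 0).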
2049
2050bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2051 MachineInstr &MI,
2052 MemInfoMap &Visited,
2053 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2054
2055 if (!(MI.mayLoad() ^ MI.mayStore()))
2056 return false;
2057
2058 // TODO: Support flat and scratch.
2059 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2060 return false;
2061
2062 if (MI.mayLoad() &&
2063 TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2064 return false;
2065
2066 if (AnchorList.count(&MI))
2067 return false;
2068
2069 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2070
2071 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2072 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2073 return false;
2074 }
2075
2076 // Step1: Find the base-registers and a 64bit constant offset.
2077 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2078 MemAddress MAddr;
2079 if (!Visited.contains(&MI)) {
2080 processBaseWithConstOffset(Base, MAddr);
2081 Visited[&MI] = MAddr;
2082 } else
2083 MAddr = Visited[&MI];
2084
2085 if (MAddr.Offset == 0) {
2086 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2087 " constant offsets that can be promoted.\n";);
2088 return false;
2089 }
2090
2091 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2092 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2093
2094 // Step2: Traverse through MI's basic block and find an anchor (one that has
2095 // the same base registers) with the highest 13-bit distance from MI's offset.
2096 // E.g. (64bit loads)
2097 // bb:
2098 // addr1 = &a + 4096; load1 = load(addr1, 0)
2099 // addr2 = &a + 6144; load2 = load(addr2, 0)
2100 // addr3 = &a + 8192; load3 = load(addr3, 0)
2101 // addr4 = &a + 10240; load4 = load(addr4, 0)
2102 // addr5 = &a + 12288; load5 = load(addr5, 0)
2103 //
2104 // Starting from the first load, the optimization will try to find a new base
2105 // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2106 // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
2107 // new base (anchor) because it is the farthest such address, which presumably
2108 // lets it cover the most intermediate bases.
2109 //
2110 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2111 // (&a + 8192) for load1, load2, load4.
2112 // addr = &a + 8192
2113 // load1 = load(addr, -4096)
2114 // load2 = load(addr, -2048)
2115 // load3 = load(addr, 0)
2116 // load4 = load(addr, 2048)
2117 // addr5 = &a + 12288; load5 = load(addr5, 0)
2118 //
2119 MachineInstr *AnchorInst = nullptr;
2120 MemAddress AnchorAddr;
2121 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2122 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2123
2124 MachineBasicBlock *MBB = MI.getParent();
2125 MachineBasicBlock::iterator E = MBB->end();
2126 MachineBasicBlock::iterator MBBI = MI.getIterator();
2127 ++MBBI;
2128 const SITargetLowering *TLI =
2129 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2130
2131 for ( ; MBBI != E; ++MBBI) {
2132 MachineInstr &MINext = *MBBI;
2133 // TODO: Support finding an anchor (with the same base) from store addresses or
2134 // any other load addresses where the opcodes are different.
2135 if (MINext.getOpcode() != MI.getOpcode() ||
2136 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2137 continue;
2138
2139 const MachineOperand &BaseNext =
2140 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2141 MemAddress MAddrNext;
2142 if (!Visited.contains(&MINext)) {
2143 processBaseWithConstOffset(BaseNext, MAddrNext);
2144 Visited[&MINext] = MAddrNext;
2145 } else
2146 MAddrNext = Visited[&MINext];
2147
2148 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2149 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2150 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2151 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2152 continue;
2153
2154 InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2155
2156 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2157 TargetLoweringBase::AddrMode AM;
2158 AM.HasBaseReg = true;
2159 AM.BaseOffs = Dist;
2160 if (TLI->isLegalGlobalAddressingMode(AM) &&
2161 (uint32_t)std::abs(Dist) > MaxDist) {
2162 MaxDist = std::abs(Dist);
2163
2164 AnchorAddr = MAddrNext;
2165 AnchorInst = &MINext;
2166 }
2167 }
2168
2169 if (AnchorInst) {
2170 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2171 AnchorInst->dump());
2172 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2173 << AnchorAddr.Offset << "\n\n");
2174
2175 // Instead of moving up, just re-compute the anchor instruction's base address.
2176 Register Base = computeBase(MI, AnchorAddr);
2177
2178 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2179 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2180
2181 for (auto P : InstsWCommonBase) {
2182 TargetLoweringBase::AddrMode AM;
2183 AM.HasBaseReg = true;
2184 AM.BaseOffs = P.second - AnchorAddr.Offset;
2185
2186 if (TLI->isLegalGlobalAddressingMode(AM)) {
2187 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
2188 dbgs() << ")"; P.first->dump());
2189 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2190 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2191 }
2192 }
2193 AnchorList.insert(AnchorInst);
2194 return true;
2195 }
2196
2197 return false;
2198}
2199
2200void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2201 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2202 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2203 if (AddrList.front().InstClass == CI.InstClass &&
2204 AddrList.front().IsAGPR == CI.IsAGPR &&
2205 AddrList.front().hasSameBaseAddress(CI)) {
2206 AddrList.emplace_back(CI);
2207 return;
2208 }
2209 }
2210
2211 // Base address not found, so add a new list.
2212 MergeableInsts.emplace_back(1, CI);
2213}
2214
2215std::pair<MachineBasicBlock::iterator, bool>
2216SILoadStoreOptimizer::collectMergeableInsts(
2217 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2218 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2219 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2220 bool Modified = false;
2221
2222 // Sort potentially mergeable instructions into lists. One list per base address.
2223 unsigned Order = 0;
2224 MachineBasicBlock::iterator BlockI = Begin;
2225 for (; BlockI != End; ++BlockI) {
2226 MachineInstr &MI = *BlockI;
2227
2228 // We run this before checking if an address is mergeable, because it can produce
2229 // better code even if the instructions aren't mergeable.
2230 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2231 Modified = true;
2232
2233 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2234 // barriers. We can look after this barrier for separate merges.
2235 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2236 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2237
2238 // Search will resume after this instruction in a separate merge list.
2239 ++BlockI;
2240 break;
2241 }
2242
2243 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2244 if (InstClass == UNKNOWN)
2245 continue;
2246
2247 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2248 int Swizzled =
2249 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2250 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2251 continue;
2252
2253 CombineInfo CI;
2254 CI.setMI(MI, *this);
2255 CI.Order = Order++;
2256
2257 if (!CI.hasMergeableAddress(*MRI))
2258 continue;
2259
2260 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2261 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2262 // operands. However, we report that ds_write2 shall have
2263 // only VGPR data so that machine copy propagation does not
2264 // create an illegal instruction with VGPR and AGPR sources.
2265 // Consequently, if we create such an instruction the verifier
2266 // will complain.
2267 continue;
2268 }
2269
2270 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2271
2272 addInstToMergeableList(CI, MergeableInsts);
2273 }
2274
2275 // At this point we have lists of Mergeable instructions.
2276 //
2277 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2278 // list try to find an instruction that can be merged with I. If an instruction
2279 // is found, it is stored in the Paired field. If no instructions are found, then
2280 // the CombineInfo object is deleted from the list.
2281
2282 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2283 E = MergeableInsts.end(); I != E;) {
2284
2285 std::list<CombineInfo> &MergeList = *I;
2286 if (MergeList.size() <= 1) {
2287 // This means we have found only one instruction with a given address
2288 // that can be merged, and we need at least 2 instructions to do a merge,
2289 // so this list can be discarded.
2290 I = MergeableInsts.erase(I);
2291 continue;
2292 }
2293
2294 // Sort the lists by offsets, this way mergeable instructions will be
2295 // adjacent to each other in the list, which will make it easier to find
2296 // matches.
2297 MergeList.sort(
2298 [] (const CombineInfo &A, const CombineInfo &B) {
2299 return A.Offset < B.Offset;
2300 });
2301 ++I;
2302 }
2303
2304 return std::pair(BlockI, Modified);
2305}
2306
2307// Scan through looking for adjacent LDS operations with constant offsets from
2308// the same base register. We rely on the scheduler to do the hard work of
2309// clustering nearby loads, and assume these are all adjacent.
2310bool SILoadStoreOptimizer::optimizeBlock(
2311 std::list<std::list<CombineInfo> > &MergeableInsts) {
2312 bool Modified = false;
2313
2314 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2315 E = MergeableInsts.end(); I != E;) {
2316 std::list<CombineInfo> &MergeList = *I;
2317
2318 bool OptimizeListAgain = false;
2319 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2320 // We weren't able to make any changes, so delete the list so we don't
2321 // process the same instructions the next time we try to optimize this
2322 // block.
2323 I = MergeableInsts.erase(I);
2324 continue;
2325 }
2326
2327 Modified = true;
2328
2329 // We made changes, but also determined that there were no more optimization
2330 // opportunities, so we don't need to reprocess the list.
2331 if (!OptimizeListAgain) {
2332 I = MergeableInsts.erase(I);
2333 continue;
2334 }
2335 OptimizeAgain = true;
2336 }
2337 return Modified;
2338}
2339
2340bool
2341SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2342 std::list<CombineInfo> &MergeList,
2343 bool &OptimizeListAgain) {
2344 if (MergeList.empty())
2345 return false;
2346
2347 bool Modified = false;
2348
2349 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2350 Next = std::next(I)) {
2351
2352 auto First = I;
2353 auto Second = Next;
2354
2355 if ((*First).Order > (*Second).Order)
2356 std::swap(First, Second);
2357 CombineInfo &CI = *First;
2358 CombineInfo &Paired = *Second;
2359
2360 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2361 if (!Where) {
2362 ++I;
2363 continue;
2364 }
2365
2366 Modified = true;
2367
2368 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2369
2370 MachineBasicBlock::iterator NewMI;
2371 switch (CI.InstClass) {
2372 default:
2373 llvm_unreachable("unknown InstClass");
2374 break;
2375 case DS_READ:
2376 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2377 break;
2378 case DS_WRITE:
2379 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2380 break;
2381 case S_BUFFER_LOAD_IMM:
2382 case S_BUFFER_LOAD_SGPR_IMM:
2383 case S_LOAD_IMM:
2384 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2385 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2386 break;
2387 case BUFFER_LOAD:
2388 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2389 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2390 break;
2391 case BUFFER_STORE:
2392 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2393 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2394 break;
2395 case MIMG:
2396 NewMI = mergeImagePair(CI, Paired, Where->I);
2397 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2398 break;
2399 case TBUFFER_LOAD:
2400 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2401 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2402 break;
2403 case TBUFFER_STORE:
2404 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2405 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2406 break;
2407 case FLAT_LOAD:
2408 case GLOBAL_LOAD:
2409 case GLOBAL_LOAD_SADDR:
2410 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2411 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2412 break;
2413 case FLAT_STORE:
2414 case GLOBAL_STORE:
2415 case GLOBAL_STORE_SADDR:
2416 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2417 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2418 break;
2419 }
2420 CI.setMI(NewMI, *this);
2421 CI.Order = Where->Order;
2422 if (I == Second)
2423 I = Next;
2424
2425 MergeList.erase(Second);
2426 }
2427
2428 return Modified;
2429}
2430
2431bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2432 if (skipFunction(MF.getFunction()))
2433 return false;
2434
2435 STM = &MF.getSubtarget<GCNSubtarget>();
2436 if (!STM->loadStoreOptEnabled())
2437 return false;
2438
2439 TII = STM->getInstrInfo();
2440 TRI = &TII->getRegisterInfo();
2441
2442 MRI = &MF.getRegInfo();
2443 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2444
2445 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2446
2447 bool Modified = false;
2448
2449 // Contains the list of instructions for which constant offsets are being
2450 // promoted to the IMM. This is tracked for an entire block at a time.
2451 SmallPtrSet<MachineInstr *, 4> AnchorList;
2452 MemInfoMap Visited;
2453
2454 for (MachineBasicBlock &MBB : MF) {
2455 MachineBasicBlock::iterator SectionEnd;
2456 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2457 I = SectionEnd) {
2458 bool CollectModified;
2459 std::list<std::list<CombineInfo>> MergeableInsts;
2460
2461 // First pass: Collect list of all instructions we know how to merge in a
2462 // subset of the block.
2463 std::tie(SectionEnd, CollectModified) =
2464 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2465
2466 Modified |= CollectModified;
2467
2468 do {
2469 OptimizeAgain = false;
2470 Modified |= optimizeBlock(MergeableInsts);
2471 } while (OptimizeAgain);
2472 }
2473
2474 Visited.clear();
2475 AnchorList.clear();
2476 }
2477
2478 return Modified;
2479}