1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25// to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset fields but close enough to each other, we can add to the base
56// pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "SILoadStoreOptimizer.h"
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
63#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
64#include "SIDefines.h"
65#include "llvm/Analysis/AliasAnalysis.h"
66#include "llvm/CodeGen/MachineFunctionPass.h"
67#include "llvm/InitializePasses.h"
68
69using namespace llvm;
70
71#define DEBUG_TYPE "si-load-store-opt"
72
73namespace {
74enum InstClassEnum {
75 UNKNOWN,
76 DS_READ,
77 DS_WRITE,
78 S_BUFFER_LOAD_IMM,
79 S_BUFFER_LOAD_SGPR_IMM,
80 S_LOAD_IMM,
81 BUFFER_LOAD,
82 BUFFER_STORE,
83 MIMG,
84 TBUFFER_LOAD,
85 TBUFFER_STORE,
86 GLOBAL_LOAD_SADDR,
87 GLOBAL_STORE_SADDR,
88 FLAT_LOAD,
89 FLAT_STORE,
90 FLAT_LOAD_SADDR,
91 FLAT_STORE_SADDR,
92 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
93 GLOBAL_STORE // any CombineInfo, they are only ever returned by
94 // getCommonInstClass.
95};
96
97struct AddressRegs {
98 unsigned char NumVAddrs = 0;
99 bool SBase = false;
100 bool SRsrc = false;
101 bool SOffset = false;
102 bool SAddr = false;
103 bool VAddr = false;
104 bool Addr = false;
105 bool SSamp = false;
106};
107
108// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
109const unsigned MaxAddressRegs = 12 + 1 + 1;
110
111class SILoadStoreOptimizer {
112 struct CombineInfo {
113 MachineBasicBlock::iterator I;
114 unsigned EltSize;
115 unsigned Offset;
116 unsigned Width;
117 unsigned Format;
118 unsigned BaseOff;
119 unsigned DMask;
120 InstClassEnum InstClass;
121 unsigned CPol = 0;
122 const TargetRegisterClass *DataRC;
123 bool UseST64;
124 int AddrIdx[MaxAddressRegs];
125 const MachineOperand *AddrReg[MaxAddressRegs];
126 unsigned NumAddresses;
127 unsigned Order;
128
129 bool hasSameBaseAddress(const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
131 return false;
132
133 const MachineInstr &MI = *CI.I;
134 for (unsigned i = 0; i < NumAddresses; i++) {
135 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
136
137 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
139 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
140 return false;
141 }
142 continue;
143 }
144
145 // Check same base pointer. Be careful of subregisters, which can occur
146 // with vectors of pointers.
147 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
148 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
149 return false;
150 }
151 }
152 return true;
153 }
154
155 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
156 for (unsigned i = 0; i < NumAddresses; ++i) {
157 const MachineOperand *AddrOp = AddrReg[i];
158 // Immediates are always OK.
159 if (AddrOp->isImm())
160 continue;
161
162 // Don't try to merge addresses that aren't either immediates or registers.
163 // TODO: Should be possible to merge FrameIndexes and maybe some other
164 // non-register operands.
165 if (!AddrOp->isReg())
166 return false;
167
168 // TODO: We should be able to merge instructions with other physical reg
169 // addresses too.
170 if (AddrOp->getReg().isPhysical() &&
171 AddrOp->getReg() != AMDGPU::SGPR_NULL)
172 return false;
173
174 // If an address has only one use then there will be no other
175 // instructions with the same address, so we can't merge this one.
176 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
177 return false;
178 }
179 return true;
180 }
181
182 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
183
184 // Order by DMask for image instructions, otherwise by offset.
185 bool operator<(const CombineInfo& Other) const {
186 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
187 }
188 };
189
190 struct BaseRegisters {
191 Register LoReg;
192 Register HiReg;
193
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
196 };
197
198 struct MemAddress {
199 BaseRegisters Base;
200 int64_t Offset = 0;
201 };
202
203 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
204
205private:
206 MachineFunction *MF = nullptr;
207 const GCNSubtarget *STM = nullptr;
208 const SIInstrInfo *TII = nullptr;
209 const SIRegisterInfo *TRI = nullptr;
210 MachineRegisterInfo *MRI = nullptr;
211 AliasAnalysis *AA = nullptr;
212 bool OptimizeAgain;
213
214 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
215 const DenseSet<Register> &ARegUses,
216 const MachineInstr &A, const MachineInstr &B) const;
217 static bool dmasksCanBeCombined(const CombineInfo &CI,
218 const SIInstrInfo &TII,
219 const CombineInfo &Paired);
220 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
221 CombineInfo &Paired, bool Modify = false);
222 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
223 const CombineInfo &Paired);
224 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
225 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
226 const CombineInfo &Paired);
227 const TargetRegisterClass *
228 getTargetRegisterClass(const CombineInfo &CI,
229 const CombineInfo &Paired) const;
230 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
231
232 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
233
234 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
235 MachineBasicBlock::iterator InsertBefore,
236 const DebugLoc &DL, AMDGPU::OpName OpName,
237 Register DestReg) const;
238 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
239 MachineBasicBlock::iterator InsertBefore,
240 const DebugLoc &DL, AMDGPU::OpName OpName) const;
241
242 unsigned read2Opcode(unsigned EltSize) const;
243 unsigned read2ST64Opcode(unsigned EltSize) const;
244 MachineBasicBlock::iterator
245 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
246 MachineBasicBlock::iterator InsertBefore);
247
248 unsigned write2Opcode(unsigned EltSize) const;
249 unsigned write2ST64Opcode(unsigned EltSize) const;
250 unsigned getWrite2Opcode(const CombineInfo &CI) const;
251
252 MachineBasicBlock::iterator
253 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
254 MachineBasicBlock::iterator InsertBefore);
255 MachineBasicBlock::iterator
256 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
257 MachineBasicBlock::iterator InsertBefore);
258 MachineBasicBlock::iterator
259 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
260 MachineBasicBlock::iterator InsertBefore);
261 MachineBasicBlock::iterator
262 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
263 MachineBasicBlock::iterator InsertBefore);
264 MachineBasicBlock::iterator
265 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
266 MachineBasicBlock::iterator InsertBefore);
267 MachineBasicBlock::iterator
268 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
269 MachineBasicBlock::iterator InsertBefore);
270 MachineBasicBlock::iterator
271 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
272 MachineBasicBlock::iterator InsertBefore);
273 MachineBasicBlock::iterator
274 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
275 MachineBasicBlock::iterator InsertBefore);
276 MachineBasicBlock::iterator
277 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
278 MachineBasicBlock::iterator InsertBefore);
279
280 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
281 int32_t NewOffset) const;
282 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
283 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
284 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
285 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
286 /// Promotes constant offset to the immediate by adjusting the base. It
287 /// tries to use a base from the nearby instructions that allows it to have
288 /// a 13-bit constant offset which gets promoted to the immediate.
289 bool promoteConstantOffsetToImm(MachineInstr &CI,
290 MemInfoMap &Visited,
291 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
292 void addInstToMergeableList(const CombineInfo &CI,
293 std::list<std::list<CombineInfo> > &MergeableInsts) const;
294
295 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
296 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
297 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
298 std::list<std::list<CombineInfo>> &MergeableInsts) const;
299
300 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
301 const CombineInfo &Paired);
302
303 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
304 const CombineInfo &Paired);
305
306 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
307 bool &OptimizeListAgain);
308 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
309
310public:
311 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
312 bool run(MachineFunction &MF);
313};
314
315class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
316public:
317 static char ID;
318
319 SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
320
321 bool runOnMachineFunction(MachineFunction &MF) override;
322
323 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
324
325 void getAnalysisUsage(AnalysisUsage &AU) const override {
326 AU.setPreservesCFG();
327 AU.addRequired<AAResultsWrapperPass>();
328
329 MachineFunctionPass::getAnalysisUsage(AU);
330 }
331
332 MachineFunctionProperties getRequiredProperties() const override {
333 return MachineFunctionProperties().setIsSSA();
334 }
335};
336
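// Returns the number of dwords accessed by \p MI (for image instructions, the
// number of enabled DMask channels); returns 0 for opcodes this pass does not
// recognize.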
337static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
338 const unsigned Opc = MI.getOpcode();
339
340 if (TII.isMUBUF(Opc)) {
341 // FIXME: Handle d16 correctly
342 return AMDGPU::getMUBUFElements(Opc);
343 }
344 if (TII.isImage(MI)) {
345 uint64_t DMaskImm =
346 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
347 return llvm::popcount(DMaskImm);
348 }
349 if (TII.isMTBUF(Opc)) {
350 return AMDGPU::getMTBUFElements(Opc);
351 }
352
353 switch (Opc) {
354 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
355 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
356 case AMDGPU::S_LOAD_DWORD_IMM:
357 case AMDGPU::GLOBAL_LOAD_DWORD:
358 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
359 case AMDGPU::GLOBAL_STORE_DWORD:
360 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
361 case AMDGPU::FLAT_LOAD_DWORD:
362 case AMDGPU::FLAT_STORE_DWORD:
363 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
364 case AMDGPU::FLAT_STORE_DWORD_SADDR:
365 return 1;
366 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
367 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
368 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
369 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
370 case AMDGPU::S_LOAD_DWORDX2_IMM:
371 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
372 case AMDGPU::GLOBAL_LOAD_DWORDX2:
373 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
374 case AMDGPU::GLOBAL_STORE_DWORDX2:
375 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
376 case AMDGPU::FLAT_LOAD_DWORDX2:
377 case AMDGPU::FLAT_STORE_DWORDX2:
378 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
379 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
380 return 2;
381 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
382 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
383 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
384 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
385 case AMDGPU::S_LOAD_DWORDX3_IMM:
386 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
387 case AMDGPU::GLOBAL_LOAD_DWORDX3:
388 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
389 case AMDGPU::GLOBAL_STORE_DWORDX3:
390 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
391 case AMDGPU::FLAT_LOAD_DWORDX3:
392 case AMDGPU::FLAT_STORE_DWORDX3:
393 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
394 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
395 return 3;
396 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
397 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
398 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
399 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
400 case AMDGPU::S_LOAD_DWORDX4_IMM:
401 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
402 case AMDGPU::GLOBAL_LOAD_DWORDX4:
403 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
404 case AMDGPU::GLOBAL_STORE_DWORDX4:
405 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
406 case AMDGPU::FLAT_LOAD_DWORDX4:
407 case AMDGPU::FLAT_STORE_DWORDX4:
408 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
409 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
410 return 4;
411 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
412 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
413 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
414 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
415 case AMDGPU::S_LOAD_DWORDX8_IMM:
416 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
417 return 8;
418 case AMDGPU::DS_READ_B32:
419 case AMDGPU::DS_READ_B32_gfx9:
420 case AMDGPU::DS_WRITE_B32:
421 case AMDGPU::DS_WRITE_B32_gfx9:
422 return 1;
423 case AMDGPU::DS_READ_B64:
424 case AMDGPU::DS_READ_B64_gfx9:
425 case AMDGPU::DS_WRITE_B64:
426 case AMDGPU::DS_WRITE_B64_gfx9:
427 return 2;
428 default:
429 return 0;
430 }
431}
432
433/// Maps instruction opcode to enum InstClassEnum.
434static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
435 switch (Opc) {
436 default:
437 if (TII.isMUBUF(Opc)) {
438 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
439 default:
440 return UNKNOWN;
441 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
442 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
443 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
444 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
445 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
446 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
447 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
448 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
449 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
450 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
451 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
452 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
453 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
456 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
457 return BUFFER_LOAD;
458 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
459 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
460 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
461 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
462 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
463 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
464 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
465 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
466 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
467 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
468 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
469 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
470 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
473 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
474 return BUFFER_STORE;
475 }
476 }
477 if (TII.isImage(Opc)) {
478 // Ignore instructions encoded without vaddr.
479 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
480 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
481 return UNKNOWN;
482 // Ignore BVH instructions
483 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
484 return UNKNOWN;
485 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
486 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
487 TII.isGather4(Opc))
488 return UNKNOWN;
489 return MIMG;
490 }
491 if (TII.isMTBUF(Opc)) {
492 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
493 default:
494 return UNKNOWN;
495 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
496 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
497 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
498 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
499 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
510 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
511 return TBUFFER_LOAD;
512 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
513 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
514 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
515 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
516 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
517 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
518 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
519 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
520 return TBUFFER_STORE;
521 }
522 }
523 return UNKNOWN;
524 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
525 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
526 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
527 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
528 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
529 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
530 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
533 return S_BUFFER_LOAD_IMM;
534 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
538 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
539 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
542 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
543 return S_BUFFER_LOAD_SGPR_IMM;
544 case AMDGPU::S_LOAD_DWORD_IMM:
545 case AMDGPU::S_LOAD_DWORDX2_IMM:
546 case AMDGPU::S_LOAD_DWORDX3_IMM:
547 case AMDGPU::S_LOAD_DWORDX4_IMM:
548 case AMDGPU::S_LOAD_DWORDX8_IMM:
549 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
550 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
551 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
552 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
553 return S_LOAD_IMM;
554 case AMDGPU::DS_READ_B32:
555 case AMDGPU::DS_READ_B32_gfx9:
556 case AMDGPU::DS_READ_B64:
557 case AMDGPU::DS_READ_B64_gfx9:
558 return DS_READ;
559 case AMDGPU::DS_WRITE_B32:
560 case AMDGPU::DS_WRITE_B32_gfx9:
561 case AMDGPU::DS_WRITE_B64:
562 case AMDGPU::DS_WRITE_B64_gfx9:
563 return DS_WRITE;
564 case AMDGPU::GLOBAL_LOAD_DWORD:
565 case AMDGPU::GLOBAL_LOAD_DWORDX2:
566 case AMDGPU::GLOBAL_LOAD_DWORDX3:
567 case AMDGPU::GLOBAL_LOAD_DWORDX4:
568 case AMDGPU::FLAT_LOAD_DWORD:
569 case AMDGPU::FLAT_LOAD_DWORDX2:
570 case AMDGPU::FLAT_LOAD_DWORDX3:
571 case AMDGPU::FLAT_LOAD_DWORDX4:
572 return FLAT_LOAD;
573 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
574 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
575 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
576 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
577 return GLOBAL_LOAD_SADDR;
578 case AMDGPU::GLOBAL_STORE_DWORD:
579 case AMDGPU::GLOBAL_STORE_DWORDX2:
580 case AMDGPU::GLOBAL_STORE_DWORDX3:
581 case AMDGPU::GLOBAL_STORE_DWORDX4:
582 case AMDGPU::FLAT_STORE_DWORD:
583 case AMDGPU::FLAT_STORE_DWORDX2:
584 case AMDGPU::FLAT_STORE_DWORDX3:
585 case AMDGPU::FLAT_STORE_DWORDX4:
586 return FLAT_STORE;
587 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
588 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
589 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
590 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
591 return GLOBAL_STORE_SADDR;
592 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
593 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
594 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
595 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
596 return FLAT_LOAD_SADDR;
597 case AMDGPU::FLAT_STORE_DWORD_SADDR:
598 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
599 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
600 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
601 return FLAT_STORE_SADDR;
602 }
603}
604
605/// Determines instruction subclass from opcode. Only instructions
606/// of the same subclass can be merged together. The merged instruction may have
607/// a different subclass but must have the same class.
608static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
609 switch (Opc) {
610 default:
611 if (TII.isMUBUF(Opc))
612 return AMDGPU::getMUBUFBaseOpcode(Opc);
613 if (TII.isImage(Opc)) {
614 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
615 assert(Info);
616 return Info->BaseOpcode;
617 }
618 if (TII.isMTBUF(Opc))
619 return AMDGPU::getMTBUFBaseOpcode(Opc);
620 return -1;
621 case AMDGPU::DS_READ_B32:
622 case AMDGPU::DS_READ_B32_gfx9:
623 case AMDGPU::DS_READ_B64:
624 case AMDGPU::DS_READ_B64_gfx9:
625 case AMDGPU::DS_WRITE_B32:
626 case AMDGPU::DS_WRITE_B32_gfx9:
627 case AMDGPU::DS_WRITE_B64:
628 case AMDGPU::DS_WRITE_B64_gfx9:
629 return Opc;
630 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
631 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
632 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
633 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
634 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
635 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
636 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
638 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
639 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
640 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
644 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
645 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
648 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
649 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
650 case AMDGPU::S_LOAD_DWORD_IMM:
651 case AMDGPU::S_LOAD_DWORDX2_IMM:
652 case AMDGPU::S_LOAD_DWORDX3_IMM:
653 case AMDGPU::S_LOAD_DWORDX4_IMM:
654 case AMDGPU::S_LOAD_DWORDX8_IMM:
655 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
656 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
657 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
658 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
659 return AMDGPU::S_LOAD_DWORD_IMM;
660 case AMDGPU::GLOBAL_LOAD_DWORD:
661 case AMDGPU::GLOBAL_LOAD_DWORDX2:
662 case AMDGPU::GLOBAL_LOAD_DWORDX3:
663 case AMDGPU::GLOBAL_LOAD_DWORDX4:
664 case AMDGPU::FLAT_LOAD_DWORD:
665 case AMDGPU::FLAT_LOAD_DWORDX2:
666 case AMDGPU::FLAT_LOAD_DWORDX3:
667 case AMDGPU::FLAT_LOAD_DWORDX4:
668 return AMDGPU::FLAT_LOAD_DWORD;
669 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
670 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
671 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
672 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
673 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
674 case AMDGPU::GLOBAL_STORE_DWORD:
675 case AMDGPU::GLOBAL_STORE_DWORDX2:
676 case AMDGPU::GLOBAL_STORE_DWORDX3:
677 case AMDGPU::GLOBAL_STORE_DWORDX4:
678 case AMDGPU::FLAT_STORE_DWORD:
679 case AMDGPU::FLAT_STORE_DWORDX2:
680 case AMDGPU::FLAT_STORE_DWORDX3:
681 case AMDGPU::FLAT_STORE_DWORDX4:
682 return AMDGPU::FLAT_STORE_DWORD;
683 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
684 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
685 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
686 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
687 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
688 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
689 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
690 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
691 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
692 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
693 case AMDGPU::FLAT_STORE_DWORD_SADDR:
694 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
695 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
696 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
697 return AMDGPU::FLAT_STORE_DWORD_SADDR;
698 }
699}
700
701// GLOBAL loads and stores are classified as FLAT initially. If both combined
702// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
703// If either or both instructions are non-segment-specific FLAT, the resulting
704// combined operation will be FLAT, potentially promoting one of the GLOBAL
705// operations to FLAT.
706// For other instructions return the original unmodified class.
707InstClassEnum
708SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
709 const CombineInfo &Paired) {
710 assert(CI.InstClass == Paired.InstClass);
711
712 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
713 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
714 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
715
716 return CI.InstClass;
717}
718
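// Record which address operands (vaddr, sbase, srsrc, soffset, saddr, ssamp,
// ...) a given opcode carries; setMI() uses this to collect the operands that
// must match for two instructions to share a base address.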
719static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
720 AddressRegs Result;
721
722 if (TII.isMUBUF(Opc)) {
723 if (AMDGPU::getMUBUFHasVAddr(Opc))
724 Result.VAddr = true;
725 if (AMDGPU::getMUBUFHasSrsrc(Opc))
726 Result.SRsrc = true;
727 if (AMDGPU::getMUBUFHasSoffset(Opc))
728 Result.SOffset = true;
729
730 return Result;
731 }
732
733 if (TII.isImage(Opc)) {
734 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
735 if (VAddr0Idx >= 0) {
736 AMDGPU::OpName RsrcName =
737 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
738 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
739 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
740 } else {
741 Result.VAddr = true;
742 }
743 Result.SRsrc = true;
744 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
745 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
746 Result.SSamp = true;
747
748 return Result;
749 }
750 if (TII.isMTBUF(Opc)) {
751 if (AMDGPU::getMTBUFHasVAddr(Opc))
752 Result.VAddr = true;
753 if (AMDGPU::getMTBUFHasSrsrc(Opc))
754 Result.SRsrc = true;
755 if (AMDGPU::getMTBUFHasSoffset(Opc))
756 Result.SOffset = true;
757
758 return Result;
759 }
760
761 switch (Opc) {
762 default:
763 return Result;
764 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
765 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
766 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
767 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
768 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
769 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
770 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
771 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
772 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
773 Result.SOffset = true;
774 [[fallthrough]];
775 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
778 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
779 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
780 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
781 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
782 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
783 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
784 case AMDGPU::S_LOAD_DWORD_IMM:
785 case AMDGPU::S_LOAD_DWORDX2_IMM:
786 case AMDGPU::S_LOAD_DWORDX3_IMM:
787 case AMDGPU::S_LOAD_DWORDX4_IMM:
788 case AMDGPU::S_LOAD_DWORDX8_IMM:
789 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
790 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
791 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
792 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
793 Result.SBase = true;
794 return Result;
795 case AMDGPU::DS_READ_B32:
796 case AMDGPU::DS_READ_B64:
797 case AMDGPU::DS_READ_B32_gfx9:
798 case AMDGPU::DS_READ_B64_gfx9:
799 case AMDGPU::DS_WRITE_B32:
800 case AMDGPU::DS_WRITE_B64:
801 case AMDGPU::DS_WRITE_B32_gfx9:
802 case AMDGPU::DS_WRITE_B64_gfx9:
803 Result.Addr = true;
804 return Result;
805 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
806 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
807 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
808 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
809 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
810 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
811 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
812 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
813 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
814 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
815 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
816 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
817 case AMDGPU::FLAT_STORE_DWORD_SADDR:
818 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
819 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
820 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
821 Result.SAddr = true;
822 [[fallthrough]];
823 case AMDGPU::GLOBAL_LOAD_DWORD:
824 case AMDGPU::GLOBAL_LOAD_DWORDX2:
825 case AMDGPU::GLOBAL_LOAD_DWORDX3:
826 case AMDGPU::GLOBAL_LOAD_DWORDX4:
827 case AMDGPU::GLOBAL_STORE_DWORD:
828 case AMDGPU::GLOBAL_STORE_DWORDX2:
829 case AMDGPU::GLOBAL_STORE_DWORDX3:
830 case AMDGPU::GLOBAL_STORE_DWORDX4:
831 case AMDGPU::FLAT_LOAD_DWORD:
832 case AMDGPU::FLAT_LOAD_DWORDX2:
833 case AMDGPU::FLAT_LOAD_DWORDX3:
834 case AMDGPU::FLAT_LOAD_DWORDX4:
835 case AMDGPU::FLAT_STORE_DWORD:
836 case AMDGPU::FLAT_STORE_DWORDX2:
837 case AMDGPU::FLAT_STORE_DWORDX3:
838 case AMDGPU::FLAT_STORE_DWORDX4:
839 Result.VAddr = true;
840 return Result;
841 }
842}
843
844void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
845 const SILoadStoreOptimizer &LSO) {
846 I = MI;
847 unsigned Opc = MI->getOpcode();
848 InstClass = getInstClass(Opc, *LSO.TII);
849
850 if (InstClass == UNKNOWN)
851 return;
852
853 DataRC = LSO.getDataRegClass(*MI);
854
855 switch (InstClass) {
856 case DS_READ:
857 EltSize =
858 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
859 : 4;
860 break;
861 case DS_WRITE:
862 EltSize =
863 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
864 : 4;
865 break;
866 case S_BUFFER_LOAD_IMM:
867 case S_BUFFER_LOAD_SGPR_IMM:
868 case S_LOAD_IMM:
869 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
870 break;
871 default:
872 EltSize = 4;
873 break;
874 }
875
876 if (InstClass == MIMG) {
877 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
878 // Offset is not considered for MIMG instructions.
879 Offset = 0;
880 } else {
881 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
882 Offset = I->getOperand(OffsetIdx).getImm();
883 }
884
885 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
886 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
887 const AMDGPU::GcnBufferFormatInfo *Info =
888 AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
889 EltSize = Info->BitsPerComp / 8;
890 }
891
892 Width = getOpcodeWidth(*I, *LSO.TII);
893
894 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
895 Offset &= 0xffff;
896 } else if (InstClass != MIMG) {
897 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
898 }
899
900 AddressRegs Regs = getRegs(Opc, *LSO.TII);
901 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
902
903 NumAddresses = 0;
904 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
905 AddrIdx[NumAddresses++] =
906 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
907 if (Regs.Addr)
908 AddrIdx[NumAddresses++] =
909 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
910 if (Regs.SBase)
911 AddrIdx[NumAddresses++] =
912 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
913 if (Regs.SRsrc)
914 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
915 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
916 if (Regs.SOffset)
917 AddrIdx[NumAddresses++] =
918 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
919 if (Regs.SAddr)
920 AddrIdx[NumAddresses++] =
921 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
922 if (Regs.VAddr)
923 AddrIdx[NumAddresses++] =
924 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
925 if (Regs.SSamp)
926 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
927 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
928 assert(NumAddresses <= MaxAddressRegs);
929
930 for (unsigned J = 0; J < NumAddresses; J++)
931 AddrReg[J] = &I->getOperand(AddrIdx[J]);
932}
933
934} // end anonymous namespace.
935
936INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
937 "SI Load Store Optimizer", false, false)
938INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
939INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
940 "SI Load Store Optimizer", false, false)
941
942char SILoadStoreOptimizerLegacy::ID = 0;
943
944char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
945
946FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
947 return new SILoadStoreOptimizerLegacy();
948}
949
950static void addDefsUsesToList(const MachineInstr &MI,
951 DenseSet<Register> &RegDefs,
952 DenseSet<Register> &RegUses) {
953 for (const auto &Op : MI.operands()) {
954 if (!Op.isReg())
955 continue;
956 if (Op.isDef())
957 RegDefs.insert(Op.getReg());
958 if (Op.readsReg())
959 RegUses.insert(Op.getReg());
960 }
961}
962
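// Conservatively check whether instruction B can be moved across instruction
// A: the two may not be aliasing memory accesses when either one stores, B may
// not read or redefine a register defined by A, and B may not clobber a
// register that A reads.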
963bool SILoadStoreOptimizer::canSwapInstructions(
964 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
965 const MachineInstr &A, const MachineInstr &B) const {
966 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
967 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
968 return false;
969 for (const auto &BOp : B.operands()) {
970 if (!BOp.isReg())
971 continue;
972 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
973 return false;
974 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
975 return false;
976 }
977 return true;
978}
979
980// Given that \p CI and \p Paired are adjacent memory operations produce a new
981// MMO for the combined operation with a new access size.
982MachineMemOperand *
983SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
984 const CombineInfo &Paired) {
985 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
986 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
987
988 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
989
990 // A base pointer for the combined operation is the same as the leading
991 // operation's pointer.
992 if (Paired < CI)
993 std::swap(MMOa, MMOb);
994
995 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
996 // If merging FLAT and GLOBAL set address space to FLAT.
997 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
998 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
999
1000 MachineFunction *MF = CI.I->getMF();
1001 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
1002}
1003
1004bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
1005 const SIInstrInfo &TII,
1006 const CombineInfo &Paired) {
1007 assert(CI.InstClass == MIMG);
1008
1009 // Ignore instructions with tfe/lwe set.
1010 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
1011 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
1012
1013 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1014 return false;
1015
1016 // Check other optional immediate operands for equality.
1017 AMDGPU::OpName OperandsToMatch[] = {
1018 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1019 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1020
1021 for (AMDGPU::OpName op : OperandsToMatch) {
1022 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
1023 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
1024 return false;
1025 if (Idx != -1 &&
1026 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
1027 return false;
1028 }
1029
1030 // Check DMask for overlaps.
1031 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
1032 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
1033
1034 if (!MaxMask)
1035 return false;
1036
1037 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
1038 if ((1u << AllowedBitsForMin) <= MinMask)
1039 return false;
1040
1041 return true;
1042}
1043
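// Look up the buffer format that matches \p OldFormat but has
// \p ComponentCount components (same bits per component and numeric format).
// Returns 0 if the subtarget has no such format.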
1044static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
1045 unsigned ComponentCount,
1046 const GCNSubtarget &STI) {
1047 if (ComponentCount > 4)
1048 return 0;
1049
1050 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1051 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1052 if (!OldFormatInfo)
1053 return 0;
1054
1055 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1056 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1057 ComponentCount,
1058 OldFormatInfo->NumFormat, STI);
1059
1060 if (!NewFormatInfo)
1061 return 0;
1062
1063 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1064 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1065
1066 return NewFormatInfo->Format;
1067}
1068
1069// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1070// highest power of two. Note that the result is well defined for all inputs
1071// including corner cases like:
1072// - if Lo == Hi, return that value
1073 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
1074// - if Lo > Hi, return 0 (as if the range wrapped around)
1075static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1076 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1077}
1078
1079bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1080 const GCNSubtarget &STI,
1081 CombineInfo &Paired,
1082 bool Modify) {
1083 assert(CI.InstClass != MIMG);
1084
1085 // XXX - Would the same offset be OK? Is there any reason this would happen or
1086 // be useful?
1087 if (CI.Offset == Paired.Offset)
1088 return false;
1089
1090 // This won't be valid if the offset isn't aligned.
1091 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1092 return false;
1093
1094 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1095
1096 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1097 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1098 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1099 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1100
1101 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1102 Info0->NumFormat != Info1->NumFormat)
1103 return false;
1104
1105 // For 8-bit or 16-bit formats there is no 3-component variant.
1106 // If NumCombinedComponents is 3, try the 4-component format and use XYZ.
1107 // Example:
1108 // tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
1109 // ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
1110 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1111 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1112 NumCombinedComponents = 4;
1113
1114 if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
1115 0)
1116 return false;
1117
1118 // Merge only when the two access ranges are strictly back-to-back;
1119 // any gap or overlap could overwrite data or leave holes.
1120 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1121 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1122 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1123 ElemIndex1 + Paired.Width != ElemIndex0)
1124 return false;
1125
1126 // 1-byte formats require 1-byte alignment.
1127 // 2-byte formats require 2-byte alignment.
1128 // 4-byte and larger formats require 4-byte alignment.
1129 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1130 unsigned RequiredAlign = std::min(MergedBytes, 4u);
1131 unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1132 if (MinOff % RequiredAlign != 0)
1133 return false;
1134
1135 return true;
1136 }
1137
1138 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1139 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1140 CI.UseST64 = false;
1141 CI.BaseOff = 0;
1142
1143 // Handle all non-DS instructions.
1144 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1145 if (EltOffset0 + CI.Width != EltOffset1 &&
1146 EltOffset1 + Paired.Width != EltOffset0)
1147 return false;
1148 // Instructions with scale_offset modifier cannot be combined unless we
1149 // also generate a code to scale the offset and reset that bit.
1150 if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
1151 return false;
1152 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1153 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1154 // Reject cases like:
1155 // dword + dwordx2 -> dwordx3
1156 // dword + dwordx3 -> dwordx4
1157 // If we tried to combine these cases, we would fail to extract a subreg
1158 // for the result of the second load due to SGPR alignment requirements.
1159 if (CI.Width != Paired.Width &&
1160 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1161 return false;
1162 }
1163 return true;
1164 }
1165
1166 // If the offset in elements doesn't fit in 8-bits, we might be able to use
1167 // the stride 64 versions.
1168 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1169 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1170 if (Modify) {
1171 CI.Offset = EltOffset0 / 64;
1172 Paired.Offset = EltOffset1 / 64;
1173 CI.UseST64 = true;
1174 }
1175 return true;
1176 }
1177
1178 // Check if the new offsets fit in the reduced 8-bit range.
1179 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1180 if (Modify) {
1181 CI.Offset = EltOffset0;
1182 Paired.Offset = EltOffset1;
1183 }
1184 return true;
1185 }
1186
1187 // Try to shift base address to decrease offsets.
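 // For example, element offsets 300 and 301 do not fit in 8 bits, but with
 // BaseOff = 256 (the most aligned value in range) they become 44 and 45,
 // which do.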
1188 uint32_t Min = std::min(EltOffset0, EltOffset1);
1189 uint32_t Max = std::max(EltOffset0, EltOffset1);
1190
1191 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1192 if (((Max - Min) & ~Mask) == 0) {
1193 if (Modify) {
1194 // From the range of values we could use for BaseOff, choose the one that
1195 // is aligned to the highest power of two, to maximise the chance that
1196 // the same offset can be reused for other load/store pairs.
1197 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1198 // Copy the low bits of the offsets, so that when we adjust them by
1199 // subtracting BaseOff they will be multiples of 64.
1200 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1201 CI.BaseOff = BaseOff * CI.EltSize;
1202 CI.Offset = (EltOffset0 - BaseOff) / 64;
1203 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1204 CI.UseST64 = true;
1205 }
1206 return true;
1207 }
1208
1209 if (isUInt<8>(Max - Min)) {
1210 if (Modify) {
1211 // From the range of values we could use for BaseOff, choose the one that
1212 // is aligned to the highest power of two, to maximise the chance that
1213 // the same offset can be reused for other load/store pairs.
1214 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1215 CI.BaseOff = BaseOff * CI.EltSize;
1216 CI.Offset = EltOffset0 - BaseOff;
1217 Paired.Offset = EltOffset1 - BaseOff;
1218 }
1219 return true;
1220 }
1221
1222 return false;
1223}
1224
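// Check that the combined width can be encoded: at most 4 dwords for
// VMEM/DS/FLAT (3 only if the subtarget has dwordx3 load/stores), and
// 2, 3, 4 or 8 dwords for scalar loads (3 only with scalar dwordx3 support).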
1225bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1226 const CombineInfo &CI,
1227 const CombineInfo &Paired) {
1228 const unsigned Width = (CI.Width + Paired.Width);
1229 switch (CI.InstClass) {
1230 default:
1231 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1232 case S_BUFFER_LOAD_IMM:
1233 case S_BUFFER_LOAD_SGPR_IMM:
1234 case S_LOAD_IMM:
1235 switch (Width) {
1236 default:
1237 return false;
1238 case 2:
1239 case 4:
1240 case 8:
1241 return true;
1242 case 3:
1243 return STM.hasScalarDwordx3Loads();
1244 }
1245 }
1246}
1247
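// Return the register class of the instruction's data operand
// (vdst/vdata/data0/sdst/sdata), or null if it has none.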
1248const TargetRegisterClass *
1249SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1250 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1251 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1252 }
1253 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1254 return TRI->getRegClassForReg(*MRI, Src->getReg());
1255 }
1256 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1257 return TRI->getRegClassForReg(*MRI, Src->getReg());
1258 }
1259 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1260 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1261 }
1262 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1263 return TRI->getRegClassForReg(*MRI, Src->getReg());
1264 }
1265 return nullptr;
1266}
1267
1268/// This function assumes that CI comes before Paired in a basic block. Return
1269/// an insertion point for the merged instruction or nullptr on failure.
1270SILoadStoreOptimizer::CombineInfo *
1271SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1272 CombineInfo &Paired) {
1273 // If another instruction has already been merged into CI, it may now be a
1274 // type that we can't do any further merging into.
1275 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1276 return nullptr;
1277 assert(CI.InstClass == Paired.InstClass);
1278
1279 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1280 getInstSubclass(Paired.I->getOpcode(), *TII))
1281 return nullptr;
1282
1283 // Check both offsets (or masks for MIMG) can be combined and fit in the
1284 // reduced range.
1285 if (CI.InstClass == MIMG) {
1286 if (!dmasksCanBeCombined(CI, *TII, Paired))
1287 return nullptr;
1288 } else {
1289 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1290 return nullptr;
1291 }
1292
1293 DenseSet<Register> RegDefs;
1294 DenseSet<Register> RegUses;
1295 CombineInfo *Where;
1296 if (CI.I->mayLoad()) {
1297 // Try to hoist Paired up to CI.
1298 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1299 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1300 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1301 return nullptr;
1302 }
1303 Where = &CI;
1304 } else {
1305 // Try to sink CI down to Paired.
1306 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1307 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1308 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1309 return nullptr;
1310 }
1311 Where = &Paired;
1312 }
1313
1314 // Call offsetsCanBeCombined with modify = true so that the offsets are
1315 // correct for the new instruction. This should return true, because
1316 // this function should only be called on CombineInfo objects that
1317 // have already been confirmed to be mergeable.
1318 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1319 offsetsCanBeCombined(CI, *STM, Paired, true);
1320
1321 if (CI.InstClass == DS_WRITE) {
1322 // Both data operands must be AGPR or VGPR, so the data registers need to
1323 // be constrained to one or the other. We expect to only emit the VGPR form
1324 // here for now.
1325 //
1326 // FIXME: There is currently a hack in getRegClass to report that the write2
1327 // operands are VGPRs. In the future we should have separate agpr
1328 // instruction definitions.
1329 const MachineOperand *Data0 =
1330 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1331 const MachineOperand *Data1 =
1332 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1333
1334 const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
1335 int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1336 AMDGPU::OpName::data0);
1337 int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1338 AMDGPU::OpName::data1);
1339
1340 const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);
1341
1342 const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);
1343
1344 if (unsigned SubReg = Data0->getSubReg()) {
1345 DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
1346 DataRC0, SubReg);
1347 }
1348
1349 if (unsigned SubReg = Data1->getSubReg()) {
1350 DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
1351 DataRC1, SubReg);
1352 }
1353
1354 if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
1355 !MRI->constrainRegClass(Data1->getReg(), DataRC1))
1356 return nullptr;
1357
1358 // TODO: If one register can be constrained, and not the other, insert a
1359 // copy.
1360 }
1361
1362 return Where;
1363}
1364
1365// Copy the merged load result from DestReg to the original dest regs of CI and
1366// Paired.
1367void SILoadStoreOptimizer::copyToDestRegs(
1368 CombineInfo &CI, CombineInfo &Paired,
1369 MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
1370 AMDGPU::OpName OpName, Register DestReg) const {
1371 MachineBasicBlock *MBB = CI.I->getParent();
1372
1373 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1374
1375 // Copy to the old destination registers.
1376 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1377 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1378 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1379
1380 // The constrained sload instructions in S_LOAD_IMM class will have
1381 // `early-clobber` flag in the dst operand. Remove the flag before using the
1382 // MOs in copies.
1383 Dest0->setIsEarlyClobber(false);
1384 Dest1->setIsEarlyClobber(false);
1385
1386 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1387 .add(*Dest0) // Copy to same destination including flags and sub reg.
1388 .addReg(DestReg, 0, SubRegIdx0);
1389 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1390 .add(*Dest1)
1391 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1392}
1393
1394// Return a register for the source of the merged store after copying the
1395// original source regs of CI and Paired into it.
1396Register
1397SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1398 MachineBasicBlock::iterator InsertBefore,
1399 const DebugLoc &DL,
1400 AMDGPU::OpName OpName) const {
1401 MachineBasicBlock *MBB = CI.I->getParent();
1402
1403 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1404
1405 // Copy to the new source register.
1406 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1407 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1408
1409 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1410 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1411
1412 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1413 .add(*Src0)
1414 .addImm(SubRegIdx0)
1415 .add(*Src1)
1416 .addImm(SubRegIdx1);
1417
1418 return SrcReg;
1419}
1420
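// Select the ds_read2 opcode for the element size; targets that still require
// M0 initialization for LDS use the pre-GFX9 forms.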
1421unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1422 if (STM->ldsRequiresM0Init())
1423 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1424 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1425}
1426
1427unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1428 if (STM->ldsRequiresM0Init())
1429 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1430
1431 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1432 : AMDGPU::DS_READ2ST64_B64_gfx9;
1433}
1434
1435MachineBasicBlock::iterator
1436SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1437 MachineBasicBlock::iterator InsertBefore) {
1438 MachineBasicBlock *MBB = CI.I->getParent();
1439
1440 // Be careful, since the addresses could be subregisters themselves in weird
1441 // cases, like vectors of pointers.
1442 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1443
1444 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1445 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1446 unsigned Opc =
1447 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1448
1449 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1450 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1451
1452 const MCInstrDesc &Read2Desc = TII->get(Opc);
1453
1454 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1455 Register DestReg = MRI->createVirtualRegister(SuperRC);
1456
1457 DebugLoc DL =
1458 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1459
1460 Register BaseReg = AddrReg->getReg();
1461 unsigned BaseSubReg = AddrReg->getSubReg();
1462 unsigned BaseRegFlags = 0;
1463 if (CI.BaseOff) {
1464 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1465 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1466 .addImm(CI.BaseOff);
1467
1468 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1469 BaseRegFlags = RegState::Kill;
1470
1471 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1472 .addReg(ImmReg)
1473 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1474 .addImm(0); // clamp bit
1475 BaseSubReg = 0;
1476 }
1477
1478 MachineInstrBuilder Read2 =
1479 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1480 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1481 .addImm(NewOffset0) // offset0
1482 .addImm(NewOffset1) // offset1
1483 .addImm(0) // gds
1484 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1485
1486 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
1487
1488 CI.I->eraseFromParent();
1489 Paired.I->eraseFromParent();
1490
1491 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1492 return Read2;
1493}
1494
1495unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1496 if (STM->ldsRequiresM0Init())
1497 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1498 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1499 : AMDGPU::DS_WRITE2_B64_gfx9;
1500}
1501
1502unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1503 if (STM->ldsRequiresM0Init())
1504 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1505 : AMDGPU::DS_WRITE2ST64_B64;
1506
1507 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1508 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1509}
1510
1511unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
1512 return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1513}
1514
1515MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1516 CombineInfo &CI, CombineInfo &Paired,
1517 MachineBasicBlock::iterator InsertBefore) {
1518 MachineBasicBlock *MBB = CI.I->getParent();
1519
1520 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1521 // sure we preserve the subregister index and any register flags set on them.
1522 const MachineOperand *AddrReg =
1523 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1524 const MachineOperand *Data0 =
1525 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1526 const MachineOperand *Data1 =
1527 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1528
1529 unsigned NewOffset0 = CI.Offset;
1530 unsigned NewOffset1 = Paired.Offset;
1531 unsigned Opc = getWrite2Opcode(CI);
1532
1533 if (NewOffset0 > NewOffset1) {
1534 // Canonicalize the merged instruction so the smaller offset comes first.
1535 std::swap(NewOffset0, NewOffset1);
1536 std::swap(Data0, Data1);
1537 }
1538
1539 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1540 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1541
1542 const MCInstrDesc &Write2Desc = TII->get(Opc);
1543 DebugLoc DL =
1544 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1545
1546 Register BaseReg = AddrReg->getReg();
1547 unsigned BaseSubReg = AddrReg->getSubReg();
1548 unsigned BaseRegFlags = 0;
1549 if (CI.BaseOff) {
1550 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1551 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1552 .addImm(CI.BaseOff);
1553
1554 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1555 BaseRegFlags = RegState::Kill;
1556
1557 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1558 .addReg(ImmReg)
1559 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1560 .addImm(0); // clamp bit
1561 BaseSubReg = 0;
1562 }
1563
1564 MachineInstrBuilder Write2 =
1565 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1566 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1567 .add(*Data0) // data0
1568 .add(*Data1) // data1
1569 .addImm(NewOffset0) // offset0
1570 .addImm(NewOffset1) // offset1
1571 .addImm(0) // gds
1572 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1573
1574 CI.I->eraseFromParent();
1575 Paired.I->eraseFromParent();
1576
1577 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1578 return Write2;
1579}
1580
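// Merge two image loads by OR-ing their DMasks into a single load with a wider
// destination, then copy the subregisters back to the original dest registers.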
1581MachineBasicBlock::iterator
1582SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1583 MachineBasicBlock::iterator InsertBefore) {
1584 MachineBasicBlock *MBB = CI.I->getParent();
1585 DebugLoc DL =
1586 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1587
1588 const unsigned Opcode = getNewOpcode(CI, Paired);
1589
1590 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1591
1592 Register DestReg = MRI->createVirtualRegister(SuperRC);
1593 unsigned MergedDMask = CI.DMask | Paired.DMask;
1594 unsigned DMaskIdx =
1595 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1596
1597 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1598 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1599 if (I == DMaskIdx)
1600 MIB.addImm(MergedDMask);
1601 else
1602 MIB.add((*CI.I).getOperand(I));
1603 }
1604
1605 // It shouldn't be possible to get this far if the two instructions
1606 // don't have a single memoperand, because MachineInstr::mayAlias()
1607 // will return true if this is the case.
1608 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1609
1610 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1611
1612 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
1613
1614 CI.I->eraseFromParent();
1615 Paired.I->eraseFromParent();
1616 return New;
1617}
1618
1619MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1620 CombineInfo &CI, CombineInfo &Paired,
1621 MachineBasicBlock::iterator InsertBefore) {
1622 MachineBasicBlock *MBB = CI.I->getParent();
1623 DebugLoc DL =
1624 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1625
1626 const unsigned Opcode = getNewOpcode(CI, Paired);
1627
1628 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1629
1630 Register DestReg = MRI->createVirtualRegister(SuperRC);
1631 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1632
1633 // It shouldn't be possible to get this far if the two instructions
1634 // don't have a single memoperand, because MachineInstr::mayAlias()
1635 // will return true if this is the case.
1636 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1637
1638 MachineInstrBuilder New =
1639 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1640 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1641 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1642 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1643 New.addImm(MergedOffset);
1644 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1645
1646 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);
1647
1648 CI.I->eraseFromParent();
1649 Paired.I->eraseFromParent();
1650 return New;
1651}
1652
1653MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1654 CombineInfo &CI, CombineInfo &Paired,
1655 MachineBasicBlock::iterator InsertBefore) {
1656 MachineBasicBlock *MBB = CI.I->getParent();
1657
1658 DebugLoc DL =
1659 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1660
1661 const unsigned Opcode = getNewOpcode(CI, Paired);
1662
1663 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1664
1665 // Create the new, wider destination register for the merged load.
1666 Register DestReg = MRI->createVirtualRegister(SuperRC);
1667 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1668
1669 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1670
1671 AddressRegs Regs = getRegs(Opcode, *TII);
1672
1673 if (Regs.VAddr)
1674 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1675
1676 // It shouldn't be possible to get this far if the two instructions
1677 // don't have a single memoperand, because MachineInstr::mayAlias()
1678 // will return true if this is the case.
1679 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1680
1681 MachineInstr *New =
1682 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1683 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1684 .addImm(MergedOffset) // offset
1685 .addImm(CI.CPol) // cpol
1686 .addImm(0) // swz
1687 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1688
1689 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
1690
1691 CI.I->eraseFromParent();
1692 Paired.I->eraseFromParent();
1693 return New;
1694}
1695
1696MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1697 CombineInfo &CI, CombineInfo &Paired,
1698 MachineBasicBlock::iterator InsertBefore) {
1699 MachineBasicBlock *MBB = CI.I->getParent();
1700
1701 DebugLoc DL =
1702 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1703
1704 const unsigned Opcode = getNewOpcode(CI, Paired);
1705
1706 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1707
1708 // Create the new, wider destination register for the merged load.
1709 Register DestReg = MRI->createVirtualRegister(SuperRC);
1710 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1711
1712 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1713
1714 AddressRegs Regs = getRegs(Opcode, *TII);
1715
1716 if (Regs.VAddr)
1717 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1718
1719 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1720 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1721 // and use XYZ of XYZW to enable the merge.
1722 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1723 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1724 NumCombinedComponents = 4;
1725 unsigned JoinedFormat =
1726 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1727
1728 // It shouldn't be possible to get this far if the two instructions
1729 // don't have a single memoperand, because MachineInstr::mayAlias()
1730 // will return true if this is the case.
1731 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1732
1733 MachineInstr *New =
1734 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1735 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1736 .addImm(MergedOffset) // offset
1737 .addImm(JoinedFormat) // format
1738 .addImm(CI.CPol) // cpol
1739 .addImm(0) // swz
1740 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1741
1742 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
1743
1744 CI.I->eraseFromParent();
1745 Paired.I->eraseFromParent();
1746 return New;
1747}
1748
1749MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1750 CombineInfo &CI, CombineInfo &Paired,
1751 MachineBasicBlock::iterator InsertBefore) {
1752 MachineBasicBlock *MBB = CI.I->getParent();
1753 DebugLoc DL =
1754 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1755
1756 const unsigned Opcode = getNewOpcode(CI, Paired);
1757
1758 Register SrcReg =
1759 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
1760
1761 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1762 .addReg(SrcReg, RegState::Kill);
1763
1764 AddressRegs Regs = getRegs(Opcode, *TII);
1765
1766 if (Regs.VAddr)
1767 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1768
1769 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1770 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1771 // and use XYZ of XYZW to enable the merge.
1772 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1773 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1774 NumCombinedComponents = 4;
1775 unsigned JoinedFormat =
1776 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1777
1778 // It shouldn't be possible to get this far if the two instructions
1779 // don't have a single memoperand, because MachineInstr::mayAlias()
1780 // will return true if this is the case.
1781 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1782
1783 MachineInstr *New =
1784 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1785 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1786 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1787 .addImm(JoinedFormat) // format
1788 .addImm(CI.CPol) // cpol
1789 .addImm(0) // swz
1790 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1791
1792 CI.I->eraseFromParent();
1793 Paired.I->eraseFromParent();
1794 return New;
1795}
1796
1797MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1798 CombineInfo &CI, CombineInfo &Paired,
1799 MachineBasicBlock::iterator InsertBefore) {
1800 MachineBasicBlock *MBB = CI.I->getParent();
1801
1802 DebugLoc DL =
1803 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1804
1805 const unsigned Opcode = getNewOpcode(CI, Paired);
1806
1807 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1808 Register DestReg = MRI->createVirtualRegister(SuperRC);
1809
1810 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1811
1812 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1813 MIB.add(*SAddr);
1814
1815 MachineInstr *New =
1816 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1817 .addImm(std::min(CI.Offset, Paired.Offset))
1818 .addImm(CI.CPol)
1819 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1820
1821 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
1822
1823 CI.I->eraseFromParent();
1824 Paired.I->eraseFromParent();
1825 return New;
1826}
1827
1828MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1829 CombineInfo &CI, CombineInfo &Paired,
1830 MachineBasicBlock::iterator InsertBefore) {
1831 MachineBasicBlock *MBB = CI.I->getParent();
1832
1833 DebugLoc DL =
1834 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1835
1836 const unsigned Opcode = getNewOpcode(CI, Paired);
1837
1838 Register SrcReg =
1839 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
1840
1841 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1842 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1843 .addReg(SrcReg, RegState::Kill);
1844
1845 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1846 MIB.add(*SAddr);
1847
1848 MachineInstr *New =
1849 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1850 .addImm(CI.CPol)
1851 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1852
1853 CI.I->eraseFromParent();
1854 Paired.I->eraseFromParent();
1855 return New;
1856}
1857
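// Illustrative example (hypothetical values, not taken from a test): with
// XNACK enabled, merging two dword loads gives Width == 2, so the merged
// access wants Width * 4 == 8 bytes of alignment. If the first load carries
// more than one MMO, or its single MMO only guarantees a 4-byte alignment,
// this returns true and getNewOpcode selects the constrained "_ec" form of
// the merged opcode.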
1858static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1859 ArrayRef<MachineMemOperand *> MMOs,
1860 unsigned Width) {
1861 // Conservatively return true if the MMO is not found.
1862 return STM.isXNACKEnabled() &&
1863 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1864}
1865
1866unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1867 const CombineInfo &Paired) {
1868 const unsigned Width = CI.Width + Paired.Width;
1869
1870 switch (getCommonInstClass(CI, Paired)) {
1871 default:
1872 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1873 // FIXME: Handle d16 correctly
1874 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1875 Width);
1876 case TBUFFER_LOAD:
1877 case TBUFFER_STORE:
1878 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1879 Width);
1880
1881 case UNKNOWN:
1882 llvm_unreachable("Unknown instruction class");
1883 case S_BUFFER_LOAD_IMM: {
1884 // If XNACK is enabled, use the constrained opcodes when the first load is
1885 // under-aligned.
1886 bool NeedsConstrainedOpc =
1887 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1888 switch (Width) {
1889 default:
1890 return 0;
1891 case 2:
1892 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1893 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1894 case 3:
1895 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1896 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1897 case 4:
1898 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1899 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1900 case 8:
1901 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1902 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1903 }
1904 }
1905 case S_BUFFER_LOAD_SGPR_IMM: {
1906 // If XNACK is enabled, use the constrained opcodes when the first load is
1907 // under-aligned.
1908 bool NeedsConstrainedOpc =
1909 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1910 switch (Width) {
1911 default:
1912 return 0;
1913 case 2:
1914 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1915 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1916 case 3:
1917 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1918 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1919 case 4:
1920 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1921 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1922 case 8:
1923 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1924 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1925 }
1926 }
1927 case S_LOAD_IMM: {
1928 // If XNACK is enabled, use the constrained opcodes when the first load is
1929 // under-aligned.
1930 bool NeedsConstrainedOpc =
1931 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1932 switch (Width) {
1933 default:
1934 return 0;
1935 case 2:
1936 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1937 : AMDGPU::S_LOAD_DWORDX2_IMM;
1938 case 3:
1939 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1940 : AMDGPU::S_LOAD_DWORDX3_IMM;
1941 case 4:
1942 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1943 : AMDGPU::S_LOAD_DWORDX4_IMM;
1944 case 8:
1945 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1946 : AMDGPU::S_LOAD_DWORDX8_IMM;
1947 }
1948 }
1949 case GLOBAL_LOAD:
1950 switch (Width) {
1951 default:
1952 return 0;
1953 case 2:
1954 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1955 case 3:
1956 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1957 case 4:
1958 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1959 }
1960 case GLOBAL_LOAD_SADDR:
1961 switch (Width) {
1962 default:
1963 return 0;
1964 case 2:
1965 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1966 case 3:
1967 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1968 case 4:
1969 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1970 }
1971 case GLOBAL_STORE:
1972 switch (Width) {
1973 default:
1974 return 0;
1975 case 2:
1976 return AMDGPU::GLOBAL_STORE_DWORDX2;
1977 case 3:
1978 return AMDGPU::GLOBAL_STORE_DWORDX3;
1979 case 4:
1980 return AMDGPU::GLOBAL_STORE_DWORDX4;
1981 }
1982 case GLOBAL_STORE_SADDR:
1983 switch (Width) {
1984 default:
1985 return 0;
1986 case 2:
1987 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1988 case 3:
1989 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1990 case 4:
1991 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1992 }
1993 case FLAT_LOAD:
1994 switch (Width) {
1995 default:
1996 return 0;
1997 case 2:
1998 return AMDGPU::FLAT_LOAD_DWORDX2;
1999 case 3:
2000 return AMDGPU::FLAT_LOAD_DWORDX3;
2001 case 4:
2002 return AMDGPU::FLAT_LOAD_DWORDX4;
2003 }
2004 case FLAT_STORE:
2005 switch (Width) {
2006 default:
2007 return 0;
2008 case 2:
2009 return AMDGPU::FLAT_STORE_DWORDX2;
2010 case 3:
2011 return AMDGPU::FLAT_STORE_DWORDX3;
2012 case 4:
2013 return AMDGPU::FLAT_STORE_DWORDX4;
2014 }
2015 case FLAT_LOAD_SADDR:
2016 switch (Width) {
2017 default:
2018 return 0;
2019 case 2:
2020 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
2021 case 3:
2022 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
2023 case 4:
2024 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
2025 }
2026 case FLAT_STORE_SADDR:
2027 switch (Width) {
2028 default:
2029 return 0;
2030 case 2:
2031 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
2032 case 3:
2033 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
2034 case 4:
2035 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
2036 }
2037 case MIMG:
2038 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
2039 "No overlaps");
2040 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
2041 }
2042}
2043
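// Worked example (hypothetical widths): with CI.Width == 2, Paired.Width == 1,
// and !(Paired < CI), the Idxs table below yields Idx0 = sub0_sub1 and
// Idx1 = sub2, i.e. CI's data occupies the low two dwords of the merged
// register and Paired's data the third dword.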
2044std::pair<unsigned, unsigned>
2045SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
2046 const CombineInfo &Paired) {
2047 assert((CI.InstClass != MIMG ||
2048 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
2049 CI.Width + Paired.Width)) &&
2050 "No overlaps");
2051
2052 unsigned Idx0;
2053 unsigned Idx1;
2054
2055 static const unsigned Idxs[5][4] = {
2056 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
2057 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
2058 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
2059 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
2060 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
2061 };
2062
2063 assert(CI.Width >= 1 && CI.Width <= 4);
2064 assert(Paired.Width >= 1 && Paired.Width <= 4);
2065
2066 if (Paired < CI) {
2067 Idx1 = Idxs[0][Paired.Width - 1];
2068 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2069 } else {
2070 Idx0 = Idxs[0][CI.Width - 1];
2071 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2072 }
2073
2074 return {Idx0, Idx1};
2075}
2076
2077const TargetRegisterClass *
2078SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
2079 const CombineInfo &Paired) const {
2080 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2081 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2082 switch (CI.Width + Paired.Width) {
2083 default:
2084 return nullptr;
2085 case 2:
2086 return &AMDGPU::SReg_64_XEXECRegClass;
2087 case 3:
2088 return &AMDGPU::SGPR_96RegClass;
2089 case 4:
2090 return &AMDGPU::SGPR_128RegClass;
2091 case 8:
2092 return &AMDGPU::SGPR_256RegClass;
2093 case 16:
2094 return &AMDGPU::SGPR_512RegClass;
2095 }
2096 }
2097
2098 // FIXME: This should compute the instruction to use, and then use the result
2099 // of TII->getRegClass.
2100 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2101 return TRI->isAGPRClass(getDataRegClass(*CI.I))
2102 ? TRI->getAGPRClassForBitWidth(BitWidth)
2103 : TRI->getVGPRClassForBitWidth(BitWidth);
2104}
2105
2106MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
2107 CombineInfo &CI, CombineInfo &Paired,
2108 MachineBasicBlock::iterator InsertBefore) {
2109 MachineBasicBlock *MBB = CI.I->getParent();
2110 DebugLoc DL =
2111 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
2112
2113 const unsigned Opcode = getNewOpcode(CI, Paired);
2114
2115 Register SrcReg =
2116 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
2117
2118 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
2119 .addReg(SrcReg, RegState::Kill);
2120
2121 AddressRegs Regs = getRegs(Opcode, *TII);
2122
2123 if (Regs.VAddr)
2124 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
2125
2126
2127 // It shouldn't be possible to get this far if the two instructions
2128 // don't have a single memoperand, because MachineInstr::mayAlias()
2129 // will return true if this is the case.
2130 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2131
2132 MachineInstr *New =
2133 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
2134 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
2135 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
2136 .addImm(CI.CPol) // cpol
2137 .addImm(0) // swz
2138 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
2139
2140 CI.I->eraseFromParent();
2141 Paired.I->eraseFromParent();
2142 return New;
2143}
2144
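// For example (illustrative values): a request for 8 becomes a plain immediate
// operand, since 8 is an inline constant; a value such as 0x1234 is not
// inlinable, so it is first materialized into an SGPR with S_MOV_B32 and a
// register operand for that SGPR is returned instead.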
2145MachineOperand
2146SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
2147 APInt V(32, Val, true);
2148 if (TII->isInlineConstant(V))
2149 return MachineOperand::CreateImm(Val);
2150
2151 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2152 MachineInstr *Mov =
2153 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
2154 TII->get(AMDGPU::S_MOV_B32), Reg)
2155 .addImm(Val);
2156 (void)Mov;
2157 LLVM_DEBUG(dbgs() << " "; Mov->dump());
2158 return MachineOperand::CreateReg(Reg, false);
2159}
2160
2161// Compute base address using Addr and return the final register.
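// A rough sketch of the sequence this emits, assuming the low half of
// Addr.Offset is not an inline constant (register names are illustrative):
//   %off_lo:sgpr_32 = S_MOV_B32 (Addr.Offset & 0xffffffff)
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %Addr.Base.LoReg, %off_lo, 0
//   %hi:vgpr_32, dead %c = V_ADDC_U32_e64 %Addr.Base.HiReg, (Addr.Offset >> 32),
//                                         killed %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1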
2162Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
2163 const MemAddress &Addr) const {
2164 MachineBasicBlock *MBB = MI.getParent();
2165 MachineBasicBlock::iterator MBBI = MI.getIterator();
2166 DebugLoc DL = MI.getDebugLoc();
2167
2168 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2169 Addr.Base.LoSubReg) &&
2170 "Expected 32-bit Base-Register-Low!!");
2171
2172 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2173 Addr.Base.HiSubReg) &&
2174 "Expected 32-bit Base-Register-Hi!!");
2175
2176 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
2177 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2178 MachineOperand OffsetHi =
2179 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2180
2181 const auto *CarryRC = TRI->getWaveMaskRegClass();
2182 Register CarryReg = MRI->createVirtualRegister(CarryRC);
2183 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2184
2185 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2186 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2187 MachineInstr *LoHalf =
2188 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2189 .addReg(CarryReg, RegState::Define)
2190 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2191 .add(OffsetLo)
2192 .addImm(0); // clamp bit
2193 (void)LoHalf;
2194 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2195
2196 MachineInstr *HiHalf =
2197 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2198 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2199 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2200 .add(OffsetHi)
2201 .addReg(CarryReg, RegState::Kill)
2202 .addImm(0); // clamp bit
2203 (void)HiHalf;
2204 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2205
2206 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2207 MachineInstr *FullBase =
2208 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2209 .addReg(DestSub0)
2210 .addImm(AMDGPU::sub0)
2211 .addReg(DestSub1)
2212 .addImm(AMDGPU::sub1);
2213 (void)FullBase;
2214 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2215
2216 return FullDestReg;
2217}
2218
2219// Update base and offset with the NewBase and NewOffset in MI.
2220void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2221 Register NewBase,
2222 int32_t NewOffset) const {
2223 auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2224 Base->setReg(NewBase);
2225 Base->setIsKill(false);
2226 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2227}
2228
2229std::optional<int32_t>
2230SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2231 if (Op.isImm())
2232 return Op.getImm();
2233
2234 if (!Op.isReg())
2235 return std::nullopt;
2236
2237 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2238 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2239 !Def->getOperand(1).isImm())
2240 return std::nullopt;
2241
2242 return Def->getOperand(1).getImm();
2243}
2244
2245// Analyze Base and extract:
2246// - 32bit base registers, subregisters
2247// - 64bit constant offset
2248// Expecting base computation as:
2249// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2250// %LO:vgpr_32, %c:sreg_64_xexec =
2251// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2252// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2253// %Base:vreg_64 =
2254// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
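// For instance (illustrative values), with %OFFSET0 = S_MOV_B32 0x1000 feeding
// the V_ADD_CO_U32_e64 and an immediate 0 on the V_ADDC_U32_e64, this fills in
// Addr.Base.LoReg/HiReg from %BASE_LO/%BASE_HI and sets Addr.Offset = 0x1000.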
2255void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2256 MemAddress &Addr) const {
2257 if (!Base.isReg())
2258 return;
2259
2260 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2261 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2262 || Def->getNumOperands() != 5)
2263 return;
2264
2265 MachineOperand BaseLo = Def->getOperand(1);
2266 MachineOperand BaseHi = Def->getOperand(3);
2267 if (!BaseLo.isReg() || !BaseHi.isReg())
2268 return;
2269
2270 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2271 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2272
2273 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2274 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2275 return;
2276
2277 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2278 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2279
2280 auto Offset0P = extractConstOffset(*Src0);
2281 if (Offset0P)
2282 BaseLo = *Src1;
2283 else {
2284 if (!(Offset0P = extractConstOffset(*Src1)))
2285 return;
2286 BaseLo = *Src0;
2287 }
2288
2289 if (!BaseLo.isReg())
2290 return;
2291
2292 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2293 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2294
2295 if (Src0->isImm())
2296 std::swap(Src0, Src1);
2297
2298 if (!Src1->isImm() || Src0->isImm())
2299 return;
2300
2301 uint64_t Offset1 = Src1->getImm();
2302 BaseHi = *Src0;
2303
2304 if (!BaseHi.isReg())
2305 return;
2306
2307 Addr.Base.LoReg = BaseLo.getReg();
2308 Addr.Base.HiReg = BaseHi.getReg();
2309 Addr.Base.LoSubReg = BaseLo.getSubReg();
2310 Addr.Base.HiSubReg = BaseHi.getSubReg();
2311 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2312}
2313
2314bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2315 MachineInstr &MI,
2316 MemInfoMap &Visited,
2317 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2318
2319 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2320 return false;
2321
2322 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2323 if (SIInstrInfo::isFLATScratch(MI))
2324 return false;
2325
2326 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2327 : AMDGPUAS::FLAT_ADDRESS;
2328
2329 if (AnchorList.count(&MI))
2330 return false;
2331
2332 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2333
2334 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2335 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2336 return false;
2337 }
2338
2339 // Step1: Find the base-registers and a 64bit constant offset.
2340 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2341 auto [It, Inserted] = Visited.try_emplace(&MI);
2342 MemAddress MAddr;
2343 if (Inserted) {
2344 processBaseWithConstOffset(Base, MAddr);
2345 It->second = MAddr;
2346 } else
2347 MAddr = It->second;
2348
2349 if (MAddr.Offset == 0) {
2350 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2351 " constant offsets that can be promoted.\n";);
2352 return false;
2353 }
2354
2355 LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2356 << printReg(MAddr.Base.LoReg, TRI)
2357 << "} Offset: " << MAddr.Offset << "\n\n";);
2358
2359 // Step2: Traverse through MI's basic block and find an anchor(that has the
2360 // same base-registers) with the highest 13bit distance from MI's offset.
2361 // E.g. (64bit loads)
2362 // bb:
2363 // addr1 = &a + 4096; load1 = load(addr1, 0)
2364 // addr2 = &a + 6144; load2 = load(addr2, 0)
2365 // addr3 = &a + 8192; load3 = load(addr3, 0)
2366 // addr4 = &a + 10240; load4 = load(addr4, 0)
2367 // addr5 = &a + 12288; load5 = load(addr5, 0)
2368 //
2369 // Starting from the first load, the optimization will try to find a new base
2370 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2371 // have a 13bit distance from &a + 4096. The heuristic considers &a + 8192
2372 // as the new base (anchor) because it has the maximum distance, which
2373 // presumably lets it accommodate more intermediate bases.
2374 //
2375 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2376 // (&a + 8192) for load1, load2, load3 and load4.
2377 // addr = &a + 8192
2378 // load1 = load(addr, -4096)
2379 // load2 = load(addr, -2048)
2380 // load3 = load(addr, 0)
2381 // load4 = load(addr, 2048)
2382 // addr5 = &a + 12288; load5 = load(addr5, 0)
2383 //
2384 MachineInstr *AnchorInst = nullptr;
2385 MemAddress AnchorAddr;
2386 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2387 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2388
2389 MachineBasicBlock *MBB = MI.getParent();
2390 MachineBasicBlock::iterator E = MBB->end();
2391 MachineBasicBlock::iterator MBBI = MI.getIterator();
2392 ++MBBI;
2393 const SITargetLowering *TLI = STM->getTargetLowering();
2394
2395 for ( ; MBBI != E; ++MBBI) {
2396 MachineInstr &MINext = *MBBI;
2397 // TODO: Support finding an anchor(with same base) from store addresses or
2398 // any other load addresses where the opcodes are different.
2399 if (MINext.getOpcode() != MI.getOpcode() ||
2400 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2401 continue;
2402
2403 const MachineOperand &BaseNext =
2404 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2405 MemAddress MAddrNext;
2406 auto [It, Inserted] = Visited.try_emplace(&MINext);
2407 if (Inserted) {
2408 processBaseWithConstOffset(BaseNext, MAddrNext);
2409 It->second = MAddrNext;
2410 } else
2411 MAddrNext = It->second;
2412
2413 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2414 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2415 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2416 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2417 continue;
2418
2419 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2420
2421 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2422 TargetLoweringBase::AddrMode AM;
2423 AM.HasBaseReg = true;
2424 AM.BaseOffs = Dist;
2425 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2426 (uint32_t)std::abs(Dist) > MaxDist) {
2427 MaxDist = std::abs(Dist);
2428
2429 AnchorAddr = MAddrNext;
2430 AnchorInst = &MINext;
2431 }
2432 }
2433
2434 if (AnchorInst) {
2435 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2436 AnchorInst->dump());
2437 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2438 << AnchorAddr.Offset << "\n\n");
2439
2440 // Instead of moving up, just re-compute anchor-instruction's base address.
2441 Register Base = computeBase(MI, AnchorAddr);
2442
2443 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2444 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2445
2446 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2447 TargetLoweringBase::AddrMode AM;
2448 AM.HasBaseReg = true;
2449 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2450
2451 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2452 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2453 OtherMI->dump());
2454 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2455 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2456 }
2457 }
2458 AnchorList.insert(AnchorInst);
2459 return true;
2460 }
2461
2462 return false;
2463}
2464
2465void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2466 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2467 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2468 if (AddrList.front().InstClass == CI.InstClass &&
2469 AddrList.front().hasSameBaseAddress(CI)) {
2470 AddrList.emplace_back(CI);
2471 return;
2472 }
2473 }
2474
2475 // Base address not found, so add a new list.
2476 MergeableInsts.emplace_back(1, CI);
2477}
2478
2479std::pair<MachineBasicBlock::iterator, bool>
2480SILoadStoreOptimizer::collectMergeableInsts(
2481 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2482 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2483 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2484 bool Modified = false;
2485
2486 // Sort potential mergeable instructions into lists. One list per base address.
2487 unsigned Order = 0;
2488 MachineBasicBlock::iterator BlockI = Begin;
2489 for (; BlockI != End; ++BlockI) {
2490 MachineInstr &MI = *BlockI;
2491
2492 // We run this before checking if an address is mergeable, because it can produce
2493 // better code even if the instructions aren't mergeable.
2494 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2495 Modified = true;
2496
2497 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2498 // barriers. We can look after this barrier for separate merges.
2499 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2500 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2501
2502 // Search will resume after this instruction in a separate merge list.
2503 ++BlockI;
2504 break;
2505 }
2506
2507 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2508 if (InstClass == UNKNOWN)
2509 continue;
2510
2511 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2512 int Swizzled =
2513 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2514 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2515 continue;
2516
2517 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2518 const MachineOperand *Fmt =
2519 TII->getNamedOperand(MI, AMDGPU::OpName::format);
2520 if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
2521 LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
2522 continue;
2523 }
2524 }
2525
2526 CombineInfo CI;
2527 CI.setMI(MI, *this);
2528 CI.Order = Order++;
2529
2530 if (!CI.hasMergeableAddress(*MRI))
2531 continue;
2532
2533 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2534
2535 addInstToMergeableList(CI, MergeableInsts);
2536 }
2537
2538 // At this point we have lists of Mergeable instructions.
2539 //
2540 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2541 // list try to find an instruction that can be merged with I. If an instruction
2542 // is found, it is stored in the Paired field. If no instructions are found, then
2543 // the CombineInfo object is deleted from the list.
2544
2545 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2546 E = MergeableInsts.end(); I != E;) {
2547
2548 std::list<CombineInfo> &MergeList = *I;
2549 if (MergeList.size() <= 1) {
2550 // This means we have found only one instruction with a given address
2551 // that can be merged, and we need at least 2 instructions to do a merge,
2552 // so this list can be discarded.
2553 I = MergeableInsts.erase(I);
2554 continue;
2555 }
2556
2557 // Sort the lists by offsets, this way mergeable instructions will be
2558 // adjacent to each other in the list, which will make it easier to find
2559 // matches.
2560 MergeList.sort(
2561 [] (const CombineInfo &A, const CombineInfo &B) {
2562 return A.Offset < B.Offset;
2563 });
2564 ++I;
2565 }
2566
2567 return {BlockI, Modified};
2568}
2569
2570// Scan through looking for adjacent LDS operations with constant offsets from
2571// the same base register. We rely on the scheduler to do the hard work of
2572// clustering nearby loads, and assume these are all adjacent.
2573bool SILoadStoreOptimizer::optimizeBlock(
2574 std::list<std::list<CombineInfo> > &MergeableInsts) {
2575 bool Modified = false;
2576
2577 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2578 E = MergeableInsts.end(); I != E;) {
2579 std::list<CombineInfo> &MergeList = *I;
2580
2581 bool OptimizeListAgain = false;
2582 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2583 // We weren't able to make any changes, so delete the list so we don't
2584 // process the same instructions the next time we try to optimize this
2585 // block.
2586 I = MergeableInsts.erase(I);
2587 continue;
2588 }
2589
2590 Modified = true;
2591
2592 // We made changes, but also determined that there were no more optimization
2593 // opportunities, so we don't need to reprocess the list
2594 if (!OptimizeListAgain) {
2595 I = MergeableInsts.erase(I);
2596 continue;
2597 }
2598 OptimizeAgain = true;
2599 }
2600 return Modified;
2601}
2602
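// A note on the OptimizeListAgain flag set below (illustrative example): when
// two 2-dword scalar loads are merged, the combined width of 4 is still below
// the 8-dword maximum handled by getNewOpcode, so the flag is set and the list
// is kept for another optimizeBlock pass, where the new 4-dword load may merge
// with a neighbor into an 8-dword load.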
2603bool
2604SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2605 std::list<CombineInfo> &MergeList,
2606 bool &OptimizeListAgain) {
2607 if (MergeList.empty())
2608 return false;
2609
2610 bool Modified = false;
2611
2612 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2613 Next = std::next(I)) {
2614
2615 auto First = I;
2616 auto Second = Next;
2617
2618 if ((*First).Order > (*Second).Order)
2619 std::swap(First, Second);
2620 CombineInfo &CI = *First;
2621 CombineInfo &Paired = *Second;
2622
2623 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2624 if (!Where) {
2625 ++I;
2626 continue;
2627 }
2628
2629 Modified = true;
2630
2631 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2632
2633 MachineBasicBlock::iterator NewMI;
2634 switch (CI.InstClass) {
2635 default:
2636 llvm_unreachable("unknown InstClass");
2637 break;
2638 case DS_READ:
2639 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2640 break;
2641 case DS_WRITE:
2642 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2643 break;
2644 case S_BUFFER_LOAD_IMM:
2645 case S_BUFFER_LOAD_SGPR_IMM:
2646 case S_LOAD_IMM:
2647 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2648 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2649 break;
2650 case BUFFER_LOAD:
2651 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2652 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2653 break;
2654 case BUFFER_STORE:
2655 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2656 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2657 break;
2658 case MIMG:
2659 NewMI = mergeImagePair(CI, Paired, Where->I);
2660 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2661 break;
2662 case TBUFFER_LOAD:
2663 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2664 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2665 break;
2666 case TBUFFER_STORE:
2667 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2668 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2669 break;
2670 case FLAT_LOAD:
2671 case FLAT_LOAD_SADDR:
2672 case GLOBAL_LOAD:
2673 case GLOBAL_LOAD_SADDR:
2674 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2675 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2676 break;
2677 case FLAT_STORE:
2678 case FLAT_STORE_SADDR:
2679 case GLOBAL_STORE:
2680 case GLOBAL_STORE_SADDR:
2681 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2682 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2683 break;
2684 }
2685 CI.setMI(NewMI, *this);
2686 CI.Order = Where->Order;
2687 if (I == Second)
2688 I = Next;
2689
2690 MergeList.erase(Second);
2691 }
2692
2693 return Modified;
2694}
2695
2696bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2697 if (skipFunction(MF.getFunction()))
2698 return false;
2699 return SILoadStoreOptimizer(
2700 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2701 .run(MF);
2702}
2703
2704bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2705 this->MF = &MF;
2706 STM = &MF.getSubtarget<GCNSubtarget>();
2707 if (!STM->loadStoreOptEnabled())
2708 return false;
2709
2710 TII = STM->getInstrInfo();
2711 TRI = &TII->getRegisterInfo();
2712
2713 MRI = &MF.getRegInfo();
2714
2715 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2716
2717 bool Modified = false;
2718
2719 // Contains the list of instructions for which constant offsets are being
2720 // promoted to the IMM. This is tracked for an entire block at a time.
2721 SmallPtrSet<MachineInstr *, 4> AnchorList;
2722 MemInfoMap Visited;
2723
2724 for (MachineBasicBlock &MBB : MF) {
2725 MachineBasicBlock::iterator SectionEnd;
2726 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2727 I = SectionEnd) {
2728 bool CollectModified;
2729 std::list<std::list<CombineInfo>> MergeableInsts;
2730
2731 // First pass: Collect list of all instructions we know how to merge in a
2732 // subset of the block.
2733 std::tie(SectionEnd, CollectModified) =
2734 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2735
2736 Modified |= CollectModified;
2737
2738 do {
2739 OptimizeAgain = false;
2740 Modified |= optimizeBlock(MergeableInsts);
2741 } while (OptimizeAgain);
2742 }
2743
2744 Visited.clear();
2745 AnchorList.clear();
2746 }
2747
2748 return Modified;
2749}
2750
2751PreservedAnalyses
2752SILoadStoreOptimizerPass::run(MachineFunction &MF,
2753 MachineFunctionAnalysisManager &MFAM) {
2754 MFPropsModifier _(*this, MF);
2755
2756 if (MF.getFunction().hasOptNone())
2757 return PreservedAnalyses::all();
2758
2759 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2760 .getManager();
2761 AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2762
2763 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2764 if (!Changed)
2765 return PreservedAnalyses::all();
2766
2767 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2768 PA.preserveSet<CFGAnalyses>();
2769 return PA;
2770}