Line data Source code
1 : //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This pass tries to fuse DS instructions with nearby immediate offsets.
11 : // This will fuse operations such as
12 : // ds_read_b32 v0, v2 offset:16
13 : // ds_read_b32 v1, v2 offset:32
14 : // ==>
15 : // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
16 : //
17 : // The same is done for certain SMEM and VMEM opcodes, e.g.:
18 : // s_buffer_load_dword s4, s[0:3], 4
19 : // s_buffer_load_dword s5, s[0:3], 8
20 : // ==>
21 : // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
22 : //
23 : //
24 : // Future improvements:
25 : //
26 : // - This currently relies on the scheduler to place loads and stores next to
27 : // each other, and then only merges adjacent pairs of instructions. It would
28 : // be good to be more flexible with interleaved instructions, and possibly run
29 : // before scheduling. It currently missing stores of constants because loading
30 : // the constant into the data register is placed between the stores, although
31 : // this is arguably a scheduling problem.
32 : //
33 : // - Recomputing live intervals seems inefficient. This currently matches only
34 : // one pair, recomputes live intervals, and then moves on to the next pair. It
35 : // would be better to compute a list of all merges that need to occur.
36 : //
37 : // - With a list of instructions to process, we can also merge more. If a
38 : // cluster of loads has offsets that are too large to fit in the 8-bit offset
39 : // field, but are close enough together that their differences fit in 8 bits,
40 : // we can add to the base pointer and use the new, reduced offsets.
41 : //
42 : //===----------------------------------------------------------------------===//
43 :
44 : #include "AMDGPU.h"
45 : #include "AMDGPUSubtarget.h"
46 : #include "SIInstrInfo.h"
47 : #include "SIRegisterInfo.h"
48 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
49 : #include "Utils/AMDGPUBaseInfo.h"
50 : #include "llvm/ADT/ArrayRef.h"
51 : #include "llvm/ADT/SmallVector.h"
52 : #include "llvm/ADT/StringRef.h"
53 : #include "llvm/Analysis/AliasAnalysis.h"
54 : #include "llvm/CodeGen/MachineBasicBlock.h"
55 : #include "llvm/CodeGen/MachineFunction.h"
56 : #include "llvm/CodeGen/MachineFunctionPass.h"
57 : #include "llvm/CodeGen/MachineInstr.h"
58 : #include "llvm/CodeGen/MachineInstrBuilder.h"
59 : #include "llvm/CodeGen/MachineOperand.h"
60 : #include "llvm/CodeGen/MachineRegisterInfo.h"
61 : #include "llvm/IR/DebugLoc.h"
62 : #include "llvm/Pass.h"
63 : #include "llvm/Support/Debug.h"
64 : #include "llvm/Support/MathExtras.h"
65 : #include "llvm/Support/raw_ostream.h"
66 : #include <algorithm>
67 : #include <cassert>
68 : #include <cstdlib>
69 : #include <iterator>
70 : #include <utility>
71 :
72 : using namespace llvm;
73 :
74 : #define DEBUG_TYPE "si-load-store-opt"
75 :
76 : namespace {
77 :
78 : class SILoadStoreOptimizer : public MachineFunctionPass {
79 : enum InstClassEnum {
80 : DS_READ_WRITE,
81 : S_BUFFER_LOAD_IMM,
82 : BUFFER_LOAD_OFFEN,
83 : BUFFER_LOAD_OFFSET,
84 : BUFFER_STORE_OFFEN,
85 : BUFFER_STORE_OFFSET,
86 : };
87 :
88 : struct CombineInfo {
89 : MachineBasicBlock::iterator I;
90 : MachineBasicBlock::iterator Paired;
91 : unsigned EltSize;
92 : unsigned Offset0;
93 : unsigned Offset1;
94 : unsigned BaseOff;
95 : InstClassEnum InstClass;
96 : bool GLC0;
97 : bool GLC1;
98 : bool SLC0;
99 : bool SLC1;
100 : bool UseST64;
101 : bool IsX2;
102 : SmallVector<MachineInstr*, 8> InstsToMove;
103 : };
104 :
105 : private:
106 : const GCNSubtarget *STM = nullptr;
107 : const SIInstrInfo *TII = nullptr;
108 : const SIRegisterInfo *TRI = nullptr;
109 : MachineRegisterInfo *MRI = nullptr;
110 : AliasAnalysis *AA = nullptr;
111 : unsigned CreatedX2;
112 :
113 : static bool offsetsCanBeCombined(CombineInfo &CI);
114 :
115 : bool findMatchingInst(CombineInfo &CI);
116 :
117 : unsigned read2Opcode(unsigned EltSize) const;
118 : unsigned read2ST64Opcode(unsigned EltSize) const;
119 : MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
120 :
121 : unsigned write2Opcode(unsigned EltSize) const;
122 : unsigned write2ST64Opcode(unsigned EltSize) const;
123 : MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
124 : MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
125 : MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
126 : unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
127 : bool &IsOffen) const;
128 : MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
129 :
130 : public:
131 : static char ID;
132 :
133 1923 : SILoadStoreOptimizer() : MachineFunctionPass(ID) {
134 1923 : initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
135 1923 : }
136 :
137 : bool optimizeBlock(MachineBasicBlock &MBB);
138 :
139 : bool runOnMachineFunction(MachineFunction &MF) override;
140 :
141 1909 : StringRef getPassName() const override { return "SI Load Store Optimizer"; }
142 :
143 1909 : void getAnalysisUsage(AnalysisUsage &AU) const override {
144 1909 : AU.setPreservesCFG();
145 : AU.addRequired<AAResultsWrapperPass>();
146 :
147 1909 : MachineFunctionPass::getAnalysisUsage(AU);
148 1909 : }
149 : };
150 :
151 : } // end anonymous namespace.
152 :
153 85105 : INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
154 : "SI Load Store Optimizer", false, false)
155 85105 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
156 200947 : INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
157 : "SI Load Store Optimizer", false, false)
158 :
159 : char SILoadStoreOptimizer::ID = 0;
160 :
161 : char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
162 :
163 0 : FunctionPass *llvm::createSILoadStoreOptimizerPass() {
164 0 : return new SILoadStoreOptimizer();
165 : }
166 :
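 : // Move each instruction in InstsToMove to the position immediately after I,
 : // preserving their relative order.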
167 3432 : static void moveInstsAfter(MachineBasicBlock::iterator I,
168 : ArrayRef<MachineInstr*> InstsToMove) {
169 3432 : MachineBasicBlock *MBB = I->getParent();
170 : ++I;
171 4493 : for (MachineInstr *MI : InstsToMove) {
172 1061 : MI->removeFromParent();
173 : MBB->insert(I, MI);
174 : }
175 3432 : }
176 :
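 : // Record every register that MI defines in RegDefs, and every physical
 : // register that MI reads in PhysRegUses.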
177 8291 : static void addDefsUsesToList(const MachineInstr &MI,
178 : DenseSet<unsigned> &RegDefs,
179 : DenseSet<unsigned> &PhysRegUses) {
180 57523 : for (const MachineOperand &Op : MI.operands()) {
181 49232 : if (Op.isReg()) {
182 28474 : if (Op.isDef())
183 4621 : RegDefs.insert(Op.getReg());
184 23850 : else if (Op.readsReg() &&
185 23850 : TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
186 9658 : PhysRegUses.insert(Op.getReg());
187 : }
188 : }
189 8291 : }
190 :
191 17105 : static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
192 : MachineBasicBlock::iterator B,
193 : const SIInstrInfo *TII,
194 : AliasAnalysis * AA) {
195 : // RAW or WAR - cannot reorder
196 : // WAW - cannot reorder
197 : // RAR - safe to reorder
198 19703 : return !(A->mayStore() || B->mayStore()) ||
199 2598 : TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
200 : }
201 :
202 : // Add MI and its defs to the lists if MI reads one of the defs that are
203 : // already in the list. Returns true in that case.
204 : static bool
205 61380 : addToListsIfDependent(MachineInstr &MI,
206 : DenseSet<unsigned> &RegDefs,
207 : DenseSet<unsigned> &PhysRegUses,
208 : SmallVectorImpl<MachineInstr*> &Insts) {
209 370106 : for (MachineOperand &Use : MI.operands()) {
210 : // If one of the defs is read, then there is a use of Def between I and the
211 : // instruction that I will potentially be merged with. We will need to move
212 : // this instruction after the merged instructions.
213 : //
214 : // Similarly, if there is a def which is read by an instruction that is to
215 : // be moved for merging, then we need to move the def-instruction as well.
216 : // This can only happen for physical registers such as M0; virtual
217 : // registers are in SSA form.
218 311487 : if (Use.isReg() &&
219 318708 : ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
220 113944 : (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
221 309536 : PhysRegUses.count(Use.getReg())))) {
222 2761 : Insts.push_back(&MI);
223 2761 : addDefsUsesToList(MI, RegDefs, PhysRegUses);
224 2761 : return true;
225 : }
226 : }
227 :
228 : return false;
229 : }
230 :
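 : // Return true if every load or store in InstsToMove can safely be reordered
 : // with respect to MemOp.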
231 : static bool
232 19619 : canMoveInstsAcrossMemOp(MachineInstr &MemOp,
233 : ArrayRef<MachineInstr*> InstsToMove,
234 : const SIInstrInfo *TII,
235 : AliasAnalysis *AA) {
236 : assert(MemOp.mayLoadOrStore());
237 :
238 43395 : for (MachineInstr *InstToMove : InstsToMove) {
239 23824 : if (!InstToMove->mayLoadOrStore())
240 : continue;
241 74 : if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
242 : return false;
243 : }
244 : return true;
245 : }
246 :
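 : // Check whether the two offsets in CI can be encoded by a single merged
 : // instruction. For DS instructions this may rewrite CI.Offset0/CI.Offset1
 : // into element (or 64-element) units and set CI.BaseOff and CI.UseST64.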
247 7166 : bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
248 : // XXX - Would the same offset be OK? Is there any reason this would happen or
249 : // be useful?
250 7166 : if (CI.Offset0 == CI.Offset1)
251 : return false;
252 :
253 : // This won't be valid if the offset isn't aligned.
254 7142 : if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
255 : return false;
256 :
257 7142 : unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
258 7142 : unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
259 7142 : CI.UseST64 = false;
260 7142 : CI.BaseOff = 0;
261 :
262 : // Handle SMEM and VMEM instructions.
263 7142 : if (CI.InstClass != DS_READ_WRITE) {
264 4040 : unsigned Diff = CI.IsX2 ? 2 : 1;
265 7727 : return (EltOffset0 + Diff == EltOffset1 ||
266 3687 : EltOffset1 + Diff == EltOffset0) &&
267 4040 : CI.GLC0 == CI.GLC1 &&
268 4040 : (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
269 : }
270 :
271 : // If the offset in elements doesn't fit in 8 bits, we might be able to use
272 : // the stride 64 versions.
273 280 : if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
274 3144 : isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
275 42 : CI.Offset0 = EltOffset0 / 64;
276 42 : CI.Offset1 = EltOffset1 / 64;
277 42 : CI.UseST64 = true;
278 42 : return true;
279 : }
280 :
281 : // Check if the new offsets fit in the reduced 8-bit range.
282 3060 : if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
283 2990 : CI.Offset0 = EltOffset0;
284 2990 : CI.Offset1 = EltOffset1;
285 2990 : return true;
286 : }
287 :
288 : // Try to shift base address to decrease offsets.
289 70 : unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
290 70 : CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
291 :
292 70 : if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
293 24 : CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
294 24 : CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
295 24 : CI.UseST64 = true;
296 24 : return true;
297 : }
298 :
299 46 : if (isUInt<8>(OffsetDiff)) {
300 37 : CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
301 37 : CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
302 37 : return true;
303 : }
304 :
305 : return false;
306 : }
307 :
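 : // Scan forward from CI.I for an instruction with the same opcode and base
 : // address that can be merged with it. On success, record the paired
 : // instruction, both offsets, and any instructions that must be moved below
 : // the merge point in CI, and return true.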
308 17465 : bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
309 17465 : MachineBasicBlock *MBB = CI.I->getParent();
310 : MachineBasicBlock::iterator E = MBB->end();
311 : MachineBasicBlock::iterator MBBI = CI.I;
312 :
313 17465 : unsigned AddrOpName[3] = {0};
314 : int AddrIdx[3];
315 : const MachineOperand *AddrReg[3];
316 : unsigned NumAddresses = 0;
317 :
318 17465 : switch (CI.InstClass) {
319 4503 : case DS_READ_WRITE:
320 4503 : AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
321 4503 : break;
322 567 : case S_BUFFER_LOAD_IMM:
323 567 : AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
324 567 : break;
325 5518 : case BUFFER_LOAD_OFFEN:
326 : case BUFFER_STORE_OFFEN:
327 5518 : AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
328 5518 : AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
329 5518 : AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
330 5518 : break;
331 6877 : case BUFFER_LOAD_OFFSET:
332 : case BUFFER_STORE_OFFSET:
333 6877 : AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
334 6877 : AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
335 6877 : break;
336 : }
337 :
338 23096 : for (unsigned i = 0; i < NumAddresses; i++) {
339 36846 : AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
340 36846 : AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
341 :
342 : // We only ever merge operations with the same base address register, so don't
343 : // bother scanning forward if there are no other uses.
344 18423 : if (AddrReg[i]->isReg() &&
345 29903 : (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
346 12242 : MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
347 12792 : return false;
348 : }
349 :
350 : ++MBBI;
351 :
352 : DenseSet<unsigned> RegDefsToMove;
353 : DenseSet<unsigned> PhysRegUsesToMove;
354 4673 : addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
355 :
356 63443 : for ( ; MBBI != E; ++MBBI) {
357 186867 : if (MBBI->getOpcode() != CI.I->getOpcode()) {
358 : // This is not a matching instruction, but we can keep looking as
359 : // long as one of these conditions is met:
360 : // 1. It is safe to move I down past MBBI.
361 : // 2. It is safe to move MBBI down past the instruction that I will
362 : // be merged into.
363 :
364 55051 : if (MBBI->hasUnmodeledSideEffects()) {
365 : // We can't re-order this instruction with respect to other memory
366 : // operations, so we fail both conditions mentioned above.
367 : return false;
368 : }
369 :
370 68268 : if (MBBI->mayLoadOrStore() &&
371 38960 : (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
372 24868 : !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
373 : // We fail condition #1, but we may still be able to satisfy condition
374 : // #2. Add this instruction to the move list and then we will check
375 : // if condition #2 holds once we have selected the matching instruction.
376 857 : CI.InstsToMove.push_back(&*MBBI);
377 857 : addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
378 857 : continue;
379 : }
380 :
381 : // When we match I with another DS instruction we will be moving I down
382 : // to the location of the matched instruction any uses of I will need to
383 : // be moved down as well.
384 54148 : addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
385 : CI.InstsToMove);
386 54148 : continue;
387 : }
388 :
389 : // Don't merge volatiles.
390 7238 : if (MBBI->hasOrderedMemoryRef())
391 : return false;
392 :
393 : // Handle a case like
394 : // DS_WRITE_B32 addr, v, idx0
395 : // w = DS_READ_B32 addr, idx0
396 : // DS_WRITE_B32 addr, f(w), idx1
397 : // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
398 : // merging of the two writes.
399 7232 : if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
400 : CI.InstsToMove))
401 : continue;
402 :
403 : bool Match = true;
404 15544 : for (unsigned i = 0; i < NumAddresses; i++) {
405 8378 : const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
406 :
407 16756 : if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
408 1066 : if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
409 1066 : AddrReg[i]->getImm() != AddrRegNext.getImm()) {
410 : Match = false;
411 : break;
412 : }
413 : continue;
414 : }
415 :
416 : // Check same base pointer. Be careful of subregisters, which can occur with
417 : // vectors of pointers.
418 7312 : if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
419 : AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
420 : Match = false;
421 : break;
422 : }
423 : }
424 :
425 7200 : if (Match) {
426 7166 : int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
427 : AMDGPU::OpName::offset);
428 7166 : CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
429 7166 : CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
430 7166 : CI.Paired = MBBI;
431 :
432 7166 : if (CI.InstClass == DS_READ_WRITE) {
433 3118 : CI.Offset0 &= 0xffff;
434 3118 : CI.Offset1 &= 0xffff;
435 : } else {
436 4048 : CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
437 4048 : CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
438 4048 : if (CI.InstClass != S_BUFFER_LOAD_IMM) {
439 1066 : CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
440 1066 : CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
441 : }
442 : }
443 :
444 : // Check both offsets fit in the reduced range.
445 : // We also need to go through the list of instructions that we plan to
446 : // move and make sure they are all safe to move down past the merged
447 : // instruction.
448 7166 : if (offsetsCanBeCombined(CI))
449 6876 : if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
450 : return true;
451 : }
452 :
453 : // We've found a load/store that we couldn't merge for some reason.
454 : // We could potentially keep looking, but we'd need to make sure that
455 : // it was safe to move I and also all the instruction in InstsToMove
456 : // down past this instruction.
457 : // check if we can move I across MBBI and if we can move all I's users
458 11283 : if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
459 11262 : !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
460 : break;
461 : }
462 : return false;
463 : }
464 :
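 : // Pick the ds_read2 opcode for the element size; the gfx9 variants are used
 : // when LDS access does not require M0 to be initialized.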
465 0 : unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
466 949 : if (STM->ldsRequiresM0Init())
467 756 : return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
468 193 : return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
469 : }
470 :
471 0 : unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
472 38 : if (STM->ldsRequiresM0Init())
473 22 : return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
474 :
475 16 : return (EltSize == 4) ?
476 : AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
477 : }
478 :
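 : // Replace the two DS reads in CI with a single ds_read2 (or ds_read2st64)
 : // instruction, then copy each half of the wide result back into the original
 : // destination registers.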
479 987 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
480 : CombineInfo &CI) {
481 987 : MachineBasicBlock *MBB = CI.I->getParent();
482 :
483 : // Be careful, since the addresses could be subregisters themselves in weird
484 : // cases, like vectors of pointers.
485 987 : const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
486 :
487 987 : const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
488 987 : const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
489 :
490 987 : unsigned NewOffset0 = CI.Offset0;
491 987 : unsigned NewOffset1 = CI.Offset1;
492 987 : unsigned Opc = CI.UseST64 ?
493 987 : read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
494 :
495 987 : unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
496 987 : unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
497 :
498 987 : if (NewOffset0 > NewOffset1) {
499 : // Canonicalize the merged instruction so the smaller offset comes first.
500 : std::swap(NewOffset0, NewOffset1);
501 : std::swap(SubRegIdx0, SubRegIdx1);
502 : }
503 :
504 : assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
505 : (NewOffset0 != NewOffset1) &&
506 : "Computed offset doesn't fit");
507 :
508 987 : const MCInstrDesc &Read2Desc = TII->get(Opc);
509 :
510 : const TargetRegisterClass *SuperRC
511 987 : = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
512 1974 : unsigned DestReg = MRI->createVirtualRegister(SuperRC);
513 :
514 : DebugLoc DL = CI.I->getDebugLoc();
515 :
516 987 : unsigned BaseReg = AddrReg->getReg();
517 : unsigned BaseSubReg = AddrReg->getSubReg();
518 : unsigned BaseRegFlags = 0;
519 987 : if (CI.BaseOff) {
520 62 : unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
521 62 : BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
522 31 : .addImm(CI.BaseOff);
523 :
524 62 : BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
525 : BaseRegFlags = RegState::Kill;
526 :
527 62 : TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
528 31 : .addReg(ImmReg)
529 31 : .addReg(AddrReg->getReg(), 0, BaseSubReg);
530 : BaseSubReg = 0;
531 : }
532 :
533 987 : MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
534 987 : .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
535 987 : .addImm(NewOffset0) // offset0
536 987 : .addImm(NewOffset1) // offset1
537 : .addImm(0) // gds
538 1974 : .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
539 :
540 : (void)Read2;
541 :
542 987 : const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
543 :
544 : // Copy to the old destination registers.
545 987 : BuildMI(*MBB, CI.Paired, DL, CopyDesc)
546 : .add(*Dest0) // Copy to same destination including flags and sub reg.
547 987 : .addReg(DestReg, 0, SubRegIdx0);
548 987 : MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
549 : .add(*Dest1)
550 987 : .addReg(DestReg, RegState::Kill, SubRegIdx1);
551 :
552 987 : moveInstsAfter(Copy1, CI.InstsToMove);
553 :
554 987 : MachineBasicBlock::iterator Next = std::next(CI.I);
555 987 : CI.I->eraseFromParent();
556 987 : CI.Paired->eraseFromParent();
557 :
558 : LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
559 987 : return Next;
560 : }
561 :
562 0 : unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
563 2072 : if (STM->ldsRequiresM0Init())
564 1463 : return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
565 609 : return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
566 : }
567 :
568 0 : unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
569 28 : if (STM->ldsRequiresM0Init())
570 14 : return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
571 :
572 14 : return (EltSize == 4) ?
573 : AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
574 : }
575 :
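 : // Replace the two DS writes in CI with a single ds_write2 (or ds_write2st64)
 : // instruction that stores both data operands.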
576 2100 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
577 : CombineInfo &CI) {
578 2100 : MachineBasicBlock *MBB = CI.I->getParent();
579 :
580 : // Be sure to use .add() rather than .addReg() with these operands. We want to
581 : // be sure we preserve the subregister index and any register flags set on them.
582 2100 : const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
583 2100 : const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
584 : const MachineOperand *Data1
585 2100 : = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
586 :
587 2100 : unsigned NewOffset0 = CI.Offset0;
588 2100 : unsigned NewOffset1 = CI.Offset1;
589 2100 : unsigned Opc = CI.UseST64 ?
590 2100 : write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
591 :
592 2100 : if (NewOffset0 > NewOffset1) {
593 : // Canonicalize the merged instruction so the smaller offset comes first.
594 : std::swap(NewOffset0, NewOffset1);
595 : std::swap(Data0, Data1);
596 : }
597 :
598 : assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
599 : (NewOffset0 != NewOffset1) &&
600 : "Computed offset doesn't fit");
601 :
602 2100 : const MCInstrDesc &Write2Desc = TII->get(Opc);
603 : DebugLoc DL = CI.I->getDebugLoc();
604 :
605 2100 : unsigned BaseReg = AddrReg->getReg();
606 : unsigned BaseSubReg = AddrReg->getSubReg();
607 : unsigned BaseRegFlags = 0;
608 2100 : if (CI.BaseOff) {
609 60 : unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
610 60 : BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
611 30 : .addImm(CI.BaseOff);
612 :
613 60 : BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
614 : BaseRegFlags = RegState::Kill;
615 :
616 60 : TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
617 30 : .addReg(ImmReg)
618 30 : .addReg(AddrReg->getReg(), 0, BaseSubReg);
619 : BaseSubReg = 0;
620 : }
621 :
622 2100 : MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
623 2100 : .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
624 : .add(*Data0) // data0
625 : .add(*Data1) // data1
626 2100 : .addImm(NewOffset0) // offset0
627 2100 : .addImm(NewOffset1) // offset1
628 : .addImm(0) // gds
629 4200 : .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
630 :
631 2100 : moveInstsAfter(Write2, CI.InstsToMove);
632 :
633 2100 : MachineBasicBlock::iterator Next = std::next(CI.I);
634 2100 : CI.I->eraseFromParent();
635 2100 : CI.Paired->eraseFromParent();
636 :
637 : LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
638 2100 : return Next;
639 : }
640 :
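 : // Merge the two s_buffer_load_dword instructions in CI into the next wider
 : // variant (dwordx2 or dwordx4) and copy the sub-registers of the result back
 : // to the original destinations.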
641 0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
642 : CombineInfo &CI) {
643 0 : MachineBasicBlock *MBB = CI.I->getParent();
644 : DebugLoc DL = CI.I->getDebugLoc();
645 0 : unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
646 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
647 :
648 : const TargetRegisterClass *SuperRC =
649 0 : CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
650 0 : unsigned DestReg = MRI->createVirtualRegister(SuperRC);
651 0 : unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
652 :
653 0 : BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
654 0 : .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
655 0 : .addImm(MergedOffset) // offset
656 0 : .addImm(CI.GLC0) // glc
657 0 : .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
658 :
659 0 : unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
660 0 : unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
661 :
662 : // Handle descending offsets
663 0 : if (CI.Offset0 > CI.Offset1)
664 : std::swap(SubRegIdx0, SubRegIdx1);
665 :
666 : // Copy to the old destination registers.
667 0 : const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
668 0 : const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
669 0 : const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
670 :
671 0 : BuildMI(*MBB, CI.Paired, DL, CopyDesc)
672 : .add(*Dest0) // Copy to same destination including flags and sub reg.
673 0 : .addReg(DestReg, 0, SubRegIdx0);
674 0 : MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
675 : .add(*Dest1)
676 0 : .addReg(DestReg, RegState::Kill, SubRegIdx1);
677 :
678 0 : moveInstsAfter(Copy1, CI.InstsToMove);
679 :
680 0 : MachineBasicBlock::iterator Next = std::next(CI.I);
681 0 : CI.I->eraseFromParent();
682 0 : CI.Paired->eraseFromParent();
683 0 : return Next;
684 : }
685 :
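 : // Merge the two buffer loads in CI (offen or offset form) into a dwordx2 or
 : // dwordx4 load and copy the sub-registers of the result back to the original
 : // destinations.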
686 0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
687 : CombineInfo &CI) {
688 0 : MachineBasicBlock *MBB = CI.I->getParent();
689 : DebugLoc DL = CI.I->getDebugLoc();
690 : unsigned Opcode;
691 :
692 0 : if (CI.InstClass == BUFFER_LOAD_OFFEN) {
693 0 : Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
694 : AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
695 : } else {
696 0 : Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
697 : AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
698 : }
699 :
700 : const TargetRegisterClass *SuperRC =
701 0 : CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
702 0 : unsigned DestReg = MRI->createVirtualRegister(SuperRC);
703 0 : unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
704 :
705 0 : auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
706 :
707 0 : if (CI.InstClass == BUFFER_LOAD_OFFEN)
708 0 : MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
709 :
710 0 : MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
711 0 : .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
712 0 : .addImm(MergedOffset) // offset
713 0 : .addImm(CI.GLC0) // glc
714 0 : .addImm(CI.SLC0) // slc
715 : .addImm(0) // tfe
716 0 : .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
717 :
718 0 : unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
719 0 : unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
720 :
721 : // Handle descending offsets
722 0 : if (CI.Offset0 > CI.Offset1)
723 : std::swap(SubRegIdx0, SubRegIdx1);
724 :
725 : // Copy to the old destination registers.
726 0 : const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
727 0 : const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
728 0 : const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
729 :
730 0 : BuildMI(*MBB, CI.Paired, DL, CopyDesc)
731 : .add(*Dest0) // Copy to same destination including flags and sub reg.
732 0 : .addReg(DestReg, 0, SubRegIdx0);
733 0 : MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
734 : .add(*Dest1)
735 0 : .addReg(DestReg, RegState::Kill, SubRegIdx1);
736 :
737 0 : moveInstsAfter(Copy1, CI.InstsToMove);
738 :
739 0 : MachineBasicBlock::iterator Next = std::next(CI.I);
740 0 : CI.I->eraseFromParent();
741 0 : CI.Paired->eraseFromParent();
742 0 : return Next;
743 : }
744 :
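 : // Return the next wider (x2 or x4) variant of a buffer-store opcode, and
 : // report whether the input was already an x2 form and whether it uses the
 : // offen addressing mode. Returns 0 for opcodes that cannot be promoted.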
745 0 : unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
746 : const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
747 0 : IsX2 = false;
748 0 : IsOffen = false;
749 :
750 0 : switch (I.getOpcode()) {
751 0 : case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
752 0 : IsOffen = true;
753 0 : return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
754 0 : case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
755 0 : IsOffen = true;
756 0 : return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
757 0 : case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
758 0 : IsX2 = true;
759 0 : IsOffen = true;
760 0 : return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
761 0 : case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
762 0 : IsX2 = true;
763 0 : IsOffen = true;
764 0 : return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
765 : case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
766 : return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
767 0 : case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
768 0 : return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
769 0 : case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
770 0 : IsX2 = true;
771 0 : return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
772 0 : case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
773 0 : IsX2 = true;
774 0 : return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
775 : }
776 0 : return 0;
777 : }
778 :
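 : // Merge the two buffer stores in CI: build a REG_SEQUENCE from the two data
 : // operands and emit a single wider store.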
779 0 : MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
780 : CombineInfo &CI) {
781 0 : MachineBasicBlock *MBB = CI.I->getParent();
782 : DebugLoc DL = CI.I->getDebugLoc();
783 : bool Unused1, Unused2;
784 0 : unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
785 :
786 0 : unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
787 0 : unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
788 :
789 : // Handle descending offsets
790 0 : if (CI.Offset0 > CI.Offset1)
791 : std::swap(SubRegIdx0, SubRegIdx1);
792 :
793 : // Copy to the new source register.
794 : const TargetRegisterClass *SuperRC =
795 0 : CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
796 0 : unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
797 :
798 0 : const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
799 0 : const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
800 :
801 0 : BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
802 : .add(*Src0)
803 0 : .addImm(SubRegIdx0)
804 : .add(*Src1)
805 0 : .addImm(SubRegIdx1);
806 :
807 0 : auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
808 0 : .addReg(SrcReg, RegState::Kill);
809 :
810 0 : if (CI.InstClass == BUFFER_STORE_OFFEN)
811 0 : MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
812 :
813 0 : MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
814 0 : .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
815 0 : .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
816 0 : .addImm(CI.GLC0) // glc
817 0 : .addImm(CI.SLC0) // slc
818 : .addImm(0) // tfe
819 0 : .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
820 :
821 0 : moveInstsAfter(MIB, CI.InstsToMove);
822 :
823 0 : MachineBasicBlock::iterator Next = std::next(CI.I);
824 0 : CI.I->eraseFromParent();
825 0 : CI.Paired->eraseFromParent();
826 0 : return Next;
827 : }
828 :
829 : // Scan through the block looking for adjacent memory operations with constant
830 : // offsets from the same base register. We rely on the scheduler to do the
831 : // hard work of clustering nearby loads, and assume these are all adjacent.
832 22025 : bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
833 : bool Modified = false;
834 :
835 468070 : for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
836 : MachineInstr &MI = *I;
837 :
838 : // Don't combine if volatile.
839 446045 : if (MI.hasOrderedMemoryRef()) {
840 : ++I;
841 33113 : continue;
842 : }
843 :
844 : CombineInfo CI;
845 430397 : CI.I = I;
846 430397 : unsigned Opc = MI.getOpcode();
847 430397 : if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
848 428910 : Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
849 :
850 1827 : CI.InstClass = DS_READ_WRITE;
851 1827 : CI.EltSize =
852 1827 : (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
853 :
854 1827 : if (findMatchingInst(CI)) {
855 : Modified = true;
856 987 : I = mergeRead2Pair(CI);
857 : } else {
858 : ++I;
859 : }
860 :
861 1827 : continue;
862 428570 : } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
863 853326 : Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
864 426663 : Opc == AMDGPU::DS_WRITE_B64_gfx9) {
865 2676 : CI.InstClass = DS_READ_WRITE;
866 : CI.EltSize
867 2676 : = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
868 :
869 2676 : if (findMatchingInst(CI)) {
870 : Modified = true;
871 2100 : I = mergeWrite2Pair(CI);
872 : } else {
873 : ++I;
874 : }
875 :
876 2676 : continue;
877 : }
878 851788 : if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
879 425894 : Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
880 : // EltSize is in units of the offset encoding.
881 567 : CI.InstClass = S_BUFFER_LOAD_IMM;
882 567 : CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
883 567 : CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
884 567 : if (findMatchingInst(CI)) {
885 : Modified = true;
886 187 : I = mergeSBufferLoadImmPair(CI);
887 187 : if (!CI.IsX2)
888 145 : CreatedX2++;
889 : } else {
890 : ++I;
891 : }
892 567 : continue;
893 : }
894 850654 : if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
895 425327 : Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
896 846256 : Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
897 423128 : Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
898 3627 : if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
899 : Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
900 2199 : CI.InstClass = BUFFER_LOAD_OFFEN;
901 : else
902 1428 : CI.InstClass = BUFFER_LOAD_OFFSET;
903 :
904 3627 : CI.EltSize = 4;
905 3627 : CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
906 3627 : Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
907 3627 : if (findMatchingInst(CI)) {
908 : Modified = true;
909 92 : I = mergeBufferLoadPair(CI);
910 92 : if (!CI.IsX2)
911 59 : CreatedX2++;
912 : } else {
913 : ++I;
914 : }
915 3627 : continue;
916 : }
917 :
918 : bool StoreIsX2, IsOffen;
919 421700 : if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
920 8768 : CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
921 8768 : CI.EltSize = 4;
922 8768 : CI.IsX2 = StoreIsX2;
923 8768 : if (findMatchingInst(CI)) {
924 : Modified = true;
925 66 : I = mergeBufferStorePair(CI);
926 66 : if (!CI.IsX2)
927 48 : CreatedX2++;
928 : } else {
929 : ++I;
930 : }
931 8768 : continue;
932 : }
933 :
934 : ++I;
935 : }
936 :
937 22025 : return Modified;
938 : }
939 :
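 : // Run the optimization over every block of the function. A block is optimized
 : // a second time if the first pass created an x2 load or store that may now be
 : // merged into an x4 form.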
940 19540 : bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
941 19540 : if (skipFunction(MF.getFunction()))
942 : return false;
943 :
944 19537 : STM = &MF.getSubtarget<GCNSubtarget>();
945 19537 : if (!STM->loadStoreOptEnabled())
946 : return false;
947 :
948 19536 : TII = STM->getInstrInfo();
949 19536 : TRI = &TII->getRegisterInfo();
950 :
951 19536 : MRI = &MF.getRegInfo();
952 19536 : AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
953 :
954 : assert(MRI->isSSA() && "Must be run on SSA");
955 :
956 : LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
957 :
958 : bool Modified = false;
959 :
960 41501 : for (MachineBasicBlock &MBB : MF) {
961 21965 : CreatedX2 = 0;
962 21965 : Modified |= optimizeBlock(MBB);
963 :
964 : // Run again to convert x2 to x4.
965 21965 : if (CreatedX2 >= 1)
966 60 : Modified |= optimizeBlock(MBB);
967 : }
968 :
969 : return Modified;
970 : }