Line data Source code
1 : //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file contains a pass that performs load / store related peephole
11 : // optimizations. This pass should be run after register allocation.
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #include "AArch64InstrInfo.h"
16 : #include "AArch64Subtarget.h"
17 : #include "MCTargetDesc/AArch64AddressingModes.h"
18 : #include "llvm/ADT/BitVector.h"
19 : #include "llvm/ADT/SmallVector.h"
20 : #include "llvm/ADT/Statistic.h"
21 : #include "llvm/ADT/StringRef.h"
22 : #include "llvm/ADT/iterator_range.h"
23 : #include "llvm/Analysis/AliasAnalysis.h"
24 : #include "llvm/CodeGen/MachineBasicBlock.h"
25 : #include "llvm/CodeGen/MachineFunction.h"
26 : #include "llvm/CodeGen/MachineFunctionPass.h"
27 : #include "llvm/CodeGen/MachineInstr.h"
28 : #include "llvm/CodeGen/MachineInstrBuilder.h"
29 : #include "llvm/CodeGen/MachineOperand.h"
30 : #include "llvm/CodeGen/TargetRegisterInfo.h"
31 : #include "llvm/IR/DebugLoc.h"
32 : #include "llvm/MC/MCRegisterInfo.h"
33 : #include "llvm/Pass.h"
34 : #include "llvm/Support/CommandLine.h"
35 : #include "llvm/Support/Debug.h"
36 : #include "llvm/Support/ErrorHandling.h"
37 : #include "llvm/Support/raw_ostream.h"
38 : #include <cassert>
39 : #include <cstdint>
40 : #include <iterator>
41 : #include <limits>
42 :
43 : using namespace llvm;
44 :
45 : #define DEBUG_TYPE "aarch64-ldst-opt"
46 :
47 : STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
48 : STATISTIC(NumPostFolded, "Number of post-index updates folded");
49 : STATISTIC(NumPreFolded, "Number of pre-index updates folded");
50 : STATISTIC(NumUnscaledPairCreated,
51 : "Number of load/store from unscaled generated");
52 : STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
53 : STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
54 :
55 : // The LdStLimit limits how far we search for load/store pairs.
56 : static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
57 : cl::init(20), cl::Hidden);
58 :
59 : // The UpdateLimit limits how far we search for update instructions when we form
60 : // pre-/post-index instructions.
61 : static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
62 : cl::Hidden);
63 :
64 : #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
65 :
66 : namespace {
67 :
68 : using LdStPairFlags = struct LdStPairFlags {
69 : // If a matching instruction is found, MergeForward is set to true if the
70 : // merge is to remove the first instruction and replace the second with
71 : // a pair-wise insn, and false if the reverse is true.
72 : bool MergeForward = false;
73 :
74 : // SExtIdx gives the index of the result of the load pair that must be
75 : // extended. The value of SExtIdx assumes that the paired load produces the
76 : // value in this order: (I, returned iterator), i.e., -1 means no value has
77 : // to be extended, 0 means I, and 1 means the returned iterator.
78 : int SExtIdx = -1;
79 :
80 : LdStPairFlags() = default;
81 :
82 941 : void setMergeForward(bool V = true) { MergeForward = V; }
83 0 : bool getMergeForward() const { return MergeForward; }
84 :
85 26416 : void setSExtIdx(int V) { SExtIdx = V; }
86 0 : int getSExtIdx() const { return SExtIdx; }
87 : };
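// Illustrative example (hypothetical registers and offsets) of how SExtIdx is
// used: pairing a sign-extending load with a plain load, e.g.
//   ldrsw x0, [sp, #4]
//   ldr   w1, [sp]
// is rewritten by mergePairedInsns below as
//   ldp   w1, w0, [sp]
//   sbfm  x0, x0, #0, #31   // re-create the sign extension of the w0 result
// with SExtIdx selecting which ldp result needs the extension (plus a KILL
// pseudo to keep the machine verifier happy).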
88 :
89 : struct AArch64LoadStoreOpt : public MachineFunctionPass {
90 : static char ID;
91 :
92 1118 : AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
93 1118 : initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
94 1118 : }
95 :
96 : AliasAnalysis *AA;
97 : const AArch64InstrInfo *TII;
98 : const TargetRegisterInfo *TRI;
99 : const AArch64Subtarget *Subtarget;
100 :
101 : // Track which register units have been modified and used.
102 : LiveRegUnits ModifiedRegUnits, UsedRegUnits;
103 :
104 1104 : void getAnalysisUsage(AnalysisUsage &AU) const override {
105 : AU.addRequired<AAResultsWrapperPass>();
106 1104 : MachineFunctionPass::getAnalysisUsage(AU);
107 1104 : }
108 :
109 : // Scan the instructions looking for a load/store that can be combined
110 : // with the current instruction into a load/store pair.
111 : // Return the matching instruction if one is found, else MBB->end().
112 : MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
113 : LdStPairFlags &Flags,
114 : unsigned Limit,
115 : bool FindNarrowMerge);
116 :
117 : // Scan the instructions looking for a store that writes to the address from
118 : // which the current load instruction reads. Return true if one is found.
119 : bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
120 : MachineBasicBlock::iterator &StoreI);
121 :
122 : // Merge the two indicated narrow store instructions into a single wider store.
123 : MachineBasicBlock::iterator
124 : mergeNarrowZeroStores(MachineBasicBlock::iterator I,
125 : MachineBasicBlock::iterator MergeMI,
126 : const LdStPairFlags &Flags);
127 :
128 : // Merge the two instructions indicated into a single pair-wise instruction.
129 : MachineBasicBlock::iterator
130 : mergePairedInsns(MachineBasicBlock::iterator I,
131 : MachineBasicBlock::iterator Paired,
132 : const LdStPairFlags &Flags);
133 :
134 : // Promote the load that reads directly from the address stored to.
135 : MachineBasicBlock::iterator
136 : promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
137 : MachineBasicBlock::iterator StoreI);
138 :
139 : // Scan the instruction list to find a base register update that can
140 : // be combined with the current instruction (a load or store) using
141 : // pre or post indexed addressing with writeback. Scan forwards.
142 : MachineBasicBlock::iterator
143 : findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
144 : int UnscaledOffset, unsigned Limit);
145 :
146 : // Scan the instruction list to find a base register update that can
147 : // be combined with the current instruction (a load or store) using
148 : // pre or post indexed addressing with writeback. Scan backwards.
149 : MachineBasicBlock::iterator
150 : findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
151 :
152 : // Find an instruction that updates the base register of the ld/st
153 : // instruction.
154 : bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
155 : unsigned BaseReg, int Offset);
156 :
157 : // Merge a pre- or post-index base register update into a ld/st instruction.
158 : MachineBasicBlock::iterator
159 : mergeUpdateInsn(MachineBasicBlock::iterator I,
160 : MachineBasicBlock::iterator Update, bool IsPreIdx);
161 :
162 : // Find and merge zero store instructions.
163 : bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
164 :
165 : // Find and pair ldr/str instructions.
166 : bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
167 :
168 : // Find and promote load instructions which read directly from a store.
169 : bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
170 :
171 : // Find and merge base register updates before or after a ld/st instruction.
172 : bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
173 :
174 : bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
175 :
176 : bool runOnMachineFunction(MachineFunction &Fn) override;
177 :
178 1104 : MachineFunctionProperties getRequiredProperties() const override {
179 1104 : return MachineFunctionProperties().set(
180 1104 : MachineFunctionProperties::Property::NoVRegs);
181 : }
182 :
183 1111 : StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
184 : };
185 :
186 : char AArch64LoadStoreOpt::ID = 0;
187 :
188 : } // end anonymous namespace
189 :
190 200150 : INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
191 : AARCH64_LOAD_STORE_OPT_NAME, false, false)
192 :
193 : static bool isNarrowStore(unsigned Opc) {
194 : switch (Opc) {
195 : default:
196 : return false;
197 : case AArch64::STRBBui:
198 : case AArch64::STURBBi:
199 : case AArch64::STRHHui:
200 : case AArch64::STURHHi:
201 : return true;
202 : }
203 : }
204 :
205 : // Scaling factor for unscaled load or store.
206 36515 : static int getMemScale(MachineInstr &MI) {
207 73030 : switch (MI.getOpcode()) {
208 0 : default:
209 0 : llvm_unreachable("Opcode has unknown scale!");
210 : case AArch64::LDRBBui:
211 : case AArch64::LDURBBi:
212 : case AArch64::LDRSBWui:
213 : case AArch64::LDURSBWi:
214 : case AArch64::STRBBui:
215 : case AArch64::STURBBi:
216 : return 1;
217 303 : case AArch64::LDRHHui:
218 : case AArch64::LDURHHi:
219 : case AArch64::LDRSHWui:
220 : case AArch64::LDURSHWi:
221 : case AArch64::STRHHui:
222 : case AArch64::STURHHi:
223 303 : return 2;
224 3700 : case AArch64::LDRSui:
225 : case AArch64::LDURSi:
226 : case AArch64::LDRSWui:
227 : case AArch64::LDURSWi:
228 : case AArch64::LDRWui:
229 : case AArch64::LDURWi:
230 : case AArch64::STRSui:
231 : case AArch64::STURSi:
232 : case AArch64::STRWui:
233 : case AArch64::STURWi:
234 : case AArch64::LDPSi:
235 : case AArch64::LDPSWi:
236 : case AArch64::LDPWi:
237 : case AArch64::STPSi:
238 : case AArch64::STPWi:
239 3700 : return 4;
240 25731 : case AArch64::LDRDui:
241 : case AArch64::LDURDi:
242 : case AArch64::LDRXui:
243 : case AArch64::LDURXi:
244 : case AArch64::STRDui:
245 : case AArch64::STURDi:
246 : case AArch64::STRXui:
247 : case AArch64::STURXi:
248 : case AArch64::LDPDi:
249 : case AArch64::LDPXi:
250 : case AArch64::STPDi:
251 : case AArch64::STPXi:
252 25731 : return 8;
253 6018 : case AArch64::LDRQui:
254 : case AArch64::LDURQi:
255 : case AArch64::STRQui:
256 : case AArch64::STURQi:
257 : case AArch64::LDPQi:
258 : case AArch64::STPQi:
259 6018 : return 16;
260 : }
261 : }
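// Note: the value returned above is the access size in bytes. The scaled (ui)
// forms encode their immediate in these units (e.g. LDRWui with imm 3
// addresses byte offset 3 * 4 = 12), while the unscaled (LDUR*) forms already
// use byte offsets; the pass multiplies by this scale when it needs bytes.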
262 :
263 24422 : static unsigned getMatchingNonSExtOpcode(unsigned Opc,
264 : bool *IsValidLdStrOpc = nullptr) {
265 24422 : if (IsValidLdStrOpc)
266 24416 : *IsValidLdStrOpc = true;
267 24422 : switch (Opc) {
268 9918 : default:
269 9918 : if (IsValidLdStrOpc)
270 9918 : *IsValidLdStrOpc = false;
271 : return std::numeric_limits<unsigned>::max();
272 : case AArch64::STRDui:
273 : case AArch64::STURDi:
274 : case AArch64::STRQui:
275 : case AArch64::STURQi:
276 : case AArch64::STRBBui:
277 : case AArch64::STURBBi:
278 : case AArch64::STRHHui:
279 : case AArch64::STURHHi:
280 : case AArch64::STRWui:
281 : case AArch64::STURWi:
282 : case AArch64::STRXui:
283 : case AArch64::STURXi:
284 : case AArch64::LDRDui:
285 : case AArch64::LDURDi:
286 : case AArch64::LDRQui:
287 : case AArch64::LDURQi:
288 : case AArch64::LDRWui:
289 : case AArch64::LDURWi:
290 : case AArch64::LDRXui:
291 : case AArch64::LDURXi:
292 : case AArch64::STRSui:
293 : case AArch64::STURSi:
294 : case AArch64::LDRSui:
295 : case AArch64::LDURSi:
296 : return Opc;
297 16 : case AArch64::LDRSWui:
298 16 : return AArch64::LDRWui;
299 10 : case AArch64::LDURSWi:
300 10 : return AArch64::LDURWi;
301 : }
302 : }
303 :
304 : static unsigned getMatchingWideOpcode(unsigned Opc) {
305 0 : switch (Opc) {
306 0 : default:
307 0 : llvm_unreachable("Opcode has no wide equivalent!");
308 : case AArch64::STRBBui:
309 : return AArch64::STRHHui;
310 0 : case AArch64::STRHHui:
311 : return AArch64::STRWui;
312 0 : case AArch64::STURBBi:
313 : return AArch64::STURHHi;
314 0 : case AArch64::STURHHi:
315 : return AArch64::STURWi;
316 0 : case AArch64::STURWi:
317 : return AArch64::STURXi;
318 0 : case AArch64::STRWui:
319 : return AArch64::STRXui;
320 : }
321 : }
322 :
323 1208 : static unsigned getMatchingPairOpcode(unsigned Opc) {
324 1208 : switch (Opc) {
325 0 : default:
326 0 : llvm_unreachable("Opcode has no pairwise equivalent!");
327 : case AArch64::STRSui:
328 : case AArch64::STURSi:
329 : return AArch64::STPSi;
330 63 : case AArch64::STRDui:
331 : case AArch64::STURDi:
332 63 : return AArch64::STPDi;
333 222 : case AArch64::STRQui:
334 : case AArch64::STURQi:
335 222 : return AArch64::STPQi;
336 115 : case AArch64::STRWui:
337 : case AArch64::STURWi:
338 115 : return AArch64::STPWi;
339 230 : case AArch64::STRXui:
340 : case AArch64::STURXi:
341 230 : return AArch64::STPXi;
342 58 : case AArch64::LDRSui:
343 : case AArch64::LDURSi:
344 58 : return AArch64::LDPSi;
345 153 : case AArch64::LDRDui:
346 : case AArch64::LDURDi:
347 153 : return AArch64::LDPDi;
348 136 : case AArch64::LDRQui:
349 : case AArch64::LDURQi:
350 136 : return AArch64::LDPQi;
351 43 : case AArch64::LDRWui:
352 : case AArch64::LDURWi:
353 43 : return AArch64::LDPWi;
354 159 : case AArch64::LDRXui:
355 : case AArch64::LDURXi:
356 159 : return AArch64::LDPXi;
357 11 : case AArch64::LDRSWui:
358 : case AArch64::LDURSWi:
359 11 : return AArch64::LDPSWi;
360 : }
361 : }
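// Illustrative example (hypothetical registers and offsets) of the pairing
// this mapping enables:
//   str w0, [sp, #8]
//   str w1, [sp, #12]
// becomes the single pair-wise instruction
//   stp w0, w1, [sp, #8]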
362 :
363 3299 : static unsigned isMatchingStore(MachineInstr &LoadInst,
364 : MachineInstr &StoreInst) {
365 3299 : unsigned LdOpc = LoadInst.getOpcode();
366 3299 : unsigned StOpc = StoreInst.getOpcode();
367 3299 : switch (LdOpc) {
368 0 : default:
369 0 : llvm_unreachable("Unsupported load instruction!");
370 15 : case AArch64::LDRBBui:
371 15 : return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
372 17 : StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
373 11 : case AArch64::LDURBBi:
374 11 : return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
375 11 : StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
376 13 : case AArch64::LDRHHui:
377 13 : return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
378 13 : StOpc == AArch64::STRXui;
379 4 : case AArch64::LDURHHi:
380 4 : return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
381 4 : StOpc == AArch64::STURXi;
382 48 : case AArch64::LDRWui:
383 48 : return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
384 4 : case AArch64::LDURWi:
385 4 : return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
386 3202 : case AArch64::LDRXui:
387 3202 : return StOpc == AArch64::STRXui;
388 2 : case AArch64::LDURXi:
389 2 : return StOpc == AArch64::STURXi;
390 : }
391 : }
392 :
393 90 : static unsigned getPreIndexedOpcode(unsigned Opc) {
394 : // FIXME: We don't currently support creating pre-indexed loads/stores when
395 : // the load or store is the unscaled version. If we decide to perform such an
396 : // optimization in the future the cases for the unscaled loads/stores will
397 : // need to be added here.
398 90 : switch (Opc) {
399 0 : default:
400 0 : llvm_unreachable("Opcode has no pre-indexed equivalent!");
401 : case AArch64::STRSui:
402 : return AArch64::STRSpre;
403 6 : case AArch64::STRDui:
404 6 : return AArch64::STRDpre;
405 9 : case AArch64::STRQui:
406 9 : return AArch64::STRQpre;
407 2 : case AArch64::STRBBui:
408 2 : return AArch64::STRBBpre;
409 2 : case AArch64::STRHHui:
410 2 : return AArch64::STRHHpre;
411 8 : case AArch64::STRWui:
412 8 : return AArch64::STRWpre;
413 9 : case AArch64::STRXui:
414 9 : return AArch64::STRXpre;
415 6 : case AArch64::LDRSui:
416 6 : return AArch64::LDRSpre;
417 6 : case AArch64::LDRDui:
418 6 : return AArch64::LDRDpre;
419 6 : case AArch64::LDRQui:
420 6 : return AArch64::LDRQpre;
421 2 : case AArch64::LDRBBui:
422 2 : return AArch64::LDRBBpre;
423 2 : case AArch64::LDRHHui:
424 2 : return AArch64::LDRHHpre;
425 6 : case AArch64::LDRWui:
426 6 : return AArch64::LDRWpre;
427 6 : case AArch64::LDRXui:
428 6 : return AArch64::LDRXpre;
429 0 : case AArch64::LDRSWui:
430 0 : return AArch64::LDRSWpre;
431 0 : case AArch64::LDPSi:
432 0 : return AArch64::LDPSpre;
433 0 : case AArch64::LDPSWi:
434 0 : return AArch64::LDPSWpre;
435 0 : case AArch64::LDPDi:
436 0 : return AArch64::LDPDpre;
437 0 : case AArch64::LDPQi:
438 0 : return AArch64::LDPQpre;
439 2 : case AArch64::LDPWi:
440 2 : return AArch64::LDPWpre;
441 0 : case AArch64::LDPXi:
442 0 : return AArch64::LDPXpre;
443 0 : case AArch64::STPSi:
444 0 : return AArch64::STPSpre;
445 0 : case AArch64::STPDi:
446 0 : return AArch64::STPDpre;
447 1 : case AArch64::STPQi:
448 1 : return AArch64::STPQpre;
449 2 : case AArch64::STPWi:
450 2 : return AArch64::STPWpre;
451 9 : case AArch64::STPXi:
452 9 : return AArch64::STPXpre;
453 : }
454 : }
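// Illustrative example (hypothetical operands) of a pre-indexed fold using
// this mapping: when the backward scan finds the base register update, e.g.
//   add x1, x1, #16
//   ldr x0, [x1]
// the pair is rewritten as the writeback form
//   ldr x0, [x1, #16]!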
455 :
456 109 : static unsigned getPostIndexedOpcode(unsigned Opc) {
457 109 : switch (Opc) {
458 0 : default:
459 0 : llvm_unreachable("Opcode has no post-indexed wise equivalent!");
460 : case AArch64::STRSui:
461 : case AArch64::STURSi:
462 : return AArch64::STRSpost;
463 5 : case AArch64::STRDui:
464 : case AArch64::STURDi:
465 5 : return AArch64::STRDpost;
466 19 : case AArch64::STRQui:
467 : case AArch64::STURQi:
468 19 : return AArch64::STRQpost;
469 2 : case AArch64::STRBBui:
470 2 : return AArch64::STRBBpost;
471 2 : case AArch64::STRHHui:
472 2 : return AArch64::STRHHpost;
473 5 : case AArch64::STRWui:
474 : case AArch64::STURWi:
475 5 : return AArch64::STRWpost;
476 7 : case AArch64::STRXui:
477 : case AArch64::STURXi:
478 7 : return AArch64::STRXpost;
479 5 : case AArch64::LDRSui:
480 : case AArch64::LDURSi:
481 5 : return AArch64::LDRSpost;
482 6 : case AArch64::LDRDui:
483 : case AArch64::LDURDi:
484 6 : return AArch64::LDRDpost;
485 12 : case AArch64::LDRQui:
486 : case AArch64::LDURQi:
487 12 : return AArch64::LDRQpost;
488 2 : case AArch64::LDRBBui:
489 2 : return AArch64::LDRBBpost;
490 2 : case AArch64::LDRHHui:
491 2 : return AArch64::LDRHHpost;
492 5 : case AArch64::LDRWui:
493 : case AArch64::LDURWi:
494 5 : return AArch64::LDRWpost;
495 11 : case AArch64::LDRXui:
496 : case AArch64::LDURXi:
497 11 : return AArch64::LDRXpost;
498 0 : case AArch64::LDRSWui:
499 0 : return AArch64::LDRSWpost;
500 0 : case AArch64::LDPSi:
501 0 : return AArch64::LDPSpost;
502 1 : case AArch64::LDPSWi:
503 1 : return AArch64::LDPSWpost;
504 0 : case AArch64::LDPDi:
505 0 : return AArch64::LDPDpost;
506 0 : case AArch64::LDPQi:
507 0 : return AArch64::LDPQpost;
508 0 : case AArch64::LDPWi:
509 0 : return AArch64::LDPWpost;
510 5 : case AArch64::LDPXi:
511 5 : return AArch64::LDPXpost;
512 2 : case AArch64::STPSi:
513 2 : return AArch64::STPSpost;
514 4 : case AArch64::STPDi:
515 4 : return AArch64::STPDpost;
516 2 : case AArch64::STPQi:
517 2 : return AArch64::STPQpost;
518 2 : case AArch64::STPWi:
519 2 : return AArch64::STPWpost;
520 5 : case AArch64::STPXi:
521 5 : return AArch64::STPXpost;
522 : }
523 : }
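// Illustrative example (hypothetical operands) of a post-indexed fold using
// this mapping: when the forward scan finds the base register update, e.g.
//   ldr x0, [x1]
//   add x1, x1, #8
// the pair is rewritten as the writeback form
//   ldr x0, [x1], #8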
524 :
525 114392 : static bool isPairedLdSt(const MachineInstr &MI) {
526 228784 : switch (MI.getOpcode()) {
527 : default:
528 : return false;
529 21833 : case AArch64::LDPSi:
530 : case AArch64::LDPSWi:
531 : case AArch64::LDPDi:
532 : case AArch64::LDPQi:
533 : case AArch64::LDPWi:
534 : case AArch64::LDPXi:
535 : case AArch64::STPSi:
536 : case AArch64::STPDi:
537 : case AArch64::STPQi:
538 : case AArch64::STPWi:
539 : case AArch64::STPXi:
540 21833 : return true;
541 : }
542 : }
543 :
544 : static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
545 : unsigned PairedRegOp = 0) {
546 : assert(PairedRegOp < 2 && "Unexpected register operand idx.");
547 22700 : unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
548 22899 : return MI.getOperand(Idx);
549 : }
550 :
551 : static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
552 29713 : unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
553 35761 : return MI.getOperand(Idx);
554 : }
555 :
556 : static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
557 39786 : unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
558 39786 : return MI.getOperand(Idx);
559 : }
560 :
561 0 : static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
562 : MachineInstr &StoreInst,
563 : const AArch64InstrInfo *TII) {
564 : assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
565 0 : int LoadSize = getMemScale(LoadInst);
566 0 : int StoreSize = getMemScale(StoreInst);
567 : int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
568 0 : ? getLdStOffsetOp(StoreInst).getImm()
569 0 : : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
570 : int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
571 0 : ? getLdStOffsetOp(LoadInst).getImm()
572 0 : : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
573 0 : return (UnscaledStOffset <= UnscaledLdOffset) &&
574 0 : (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
575 : }
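// Worked example (hypothetical offsets) of the containment check above: for
// an 8-byte store at unscaled offset 8 and a 4-byte load at unscaled offset
// 12, the store covers bytes [8, 16) and the load reads bytes [12, 16);
// 8 <= 12 and 12 + 4 <= 8 + 8, so the load is fully covered by the store.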
576 :
577 : static bool isPromotableZeroStoreInst(MachineInstr &MI) {
578 81761 : unsigned Opc = MI.getOpcode();
579 81761 : return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
580 81780 : isNarrowStore(Opc)) &&
581 1091 : getLdStRegOp(MI).getReg() == AArch64::WZR;
582 : }
583 :
584 75914 : static bool isPromotableLoadFromStore(MachineInstr &MI) {
585 151828 : switch (MI.getOpcode()) {
586 : default:
587 : return false;
588 : // Scaled instructions.
589 3781 : case AArch64::LDRBBui:
590 : case AArch64::LDRHHui:
591 : case AArch64::LDRWui:
592 : case AArch64::LDRXui:
593 : // Unscaled instructions.
594 : case AArch64::LDURBBi:
595 : case AArch64::LDURHHi:
596 : case AArch64::LDURWi:
597 : case AArch64::LDURXi:
598 3781 : return true;
599 : }
600 : }
601 :
602 74837 : static bool isMergeableLdStUpdate(MachineInstr &MI) {
603 74837 : unsigned Opc = MI.getOpcode();
604 74837 : switch (Opc) {
605 : default:
606 : return false;
607 : // Scaled instructions.
608 : case AArch64::STRSui:
609 : case AArch64::STRDui:
610 : case AArch64::STRQui:
611 : case AArch64::STRXui:
612 : case AArch64::STRWui:
613 : case AArch64::STRHHui:
614 : case AArch64::STRBBui:
615 : case AArch64::LDRSui:
616 : case AArch64::LDRDui:
617 : case AArch64::LDRQui:
618 : case AArch64::LDRXui:
619 : case AArch64::LDRWui:
620 : case AArch64::LDRHHui:
621 : case AArch64::LDRBBui:
622 : // Unscaled instructions.
623 : case AArch64::STURSi:
624 : case AArch64::STURDi:
625 : case AArch64::STURQi:
626 : case AArch64::STURWi:
627 : case AArch64::STURXi:
628 : case AArch64::LDURSi:
629 : case AArch64::LDURDi:
630 : case AArch64::LDURQi:
631 : case AArch64::LDURWi:
632 : case AArch64::LDURXi:
633 : // Paired instructions.
634 : case AArch64::LDPSi:
635 : case AArch64::LDPSWi:
636 : case AArch64::LDPDi:
637 : case AArch64::LDPQi:
638 : case AArch64::LDPWi:
639 : case AArch64::LDPXi:
640 : case AArch64::STPSi:
641 : case AArch64::STPDi:
642 : case AArch64::STPQi:
643 : case AArch64::STPWi:
644 : case AArch64::STPXi:
645 : // Make sure this is a reg+imm (as opposed to an address reloc).
646 13739 : if (!getLdStOffsetOp(MI).isImm())
647 1622 : return false;
648 :
649 : return true;
650 : }
651 : }
652 :
653 : MachineBasicBlock::iterator
654 0 : AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
655 : MachineBasicBlock::iterator MergeMI,
656 : const LdStPairFlags &Flags) {
657 : assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
658 : "Expected promotable zero stores.");
659 :
660 0 : MachineBasicBlock::iterator NextI = I;
661 : ++NextI;
662 : // If NextI is the second of the two instructions to be merged, we need
663 : // to skip one further. Either way, the merge will invalidate the iterator,
664 : // and we don't need to scan the new instruction, as it's a pairwise
665 : // instruction, which we're not considering for further action anyway.
666 0 : if (NextI == MergeMI)
667 : ++NextI;
668 :
669 0 : unsigned Opc = I->getOpcode();
670 0 : bool IsScaled = !TII->isUnscaledLdSt(Opc);
671 0 : int OffsetStride = IsScaled ? 1 : getMemScale(*I);
672 :
673 0 : bool MergeForward = Flags.getMergeForward();
674 : // Insert our new paired instruction after whichever of the paired
675 : // instructions MergeForward indicates.
676 0 : MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
677 : // MergeForward also determines which instruction we copy the base register
678 : // operand from, so that the flags stay compatible with the input code.
679 : const MachineOperand &BaseRegOp =
680 0 : MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
681 :
682 : // Which register is Rt and which is Rt2 depends on the offset order.
683 : MachineInstr *RtMI;
684 0 : if (getLdStOffsetOp(*I).getImm() ==
685 0 : getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
686 : RtMI = &*MergeMI;
687 : else
688 : RtMI = &*I;
689 :
690 0 : int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
691 : // Rescale the offset from the narrow type's units to the wider type's units.
692 0 : if (IsScaled) {
693 : assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
694 0 : OffsetImm /= 2;
695 : }
696 :
697 : // Construct the new instruction.
698 : DebugLoc DL = I->getDebugLoc();
699 0 : MachineBasicBlock *MBB = I->getParent();
700 : MachineInstrBuilder MIB;
701 0 : MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
702 0 : .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
703 : .add(BaseRegOp)
704 0 : .addImm(OffsetImm)
705 0 : .cloneMergedMemRefs({&*I, &*MergeMI})
706 0 : .setMIFlags(I->mergeFlagsWith(*MergeMI));
707 : (void)MIB;
708 :
709 : LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
710 : LLVM_DEBUG(I->print(dbgs()));
711 : LLVM_DEBUG(dbgs() << " ");
712 : LLVM_DEBUG(MergeMI->print(dbgs()));
713 : LLVM_DEBUG(dbgs() << " with instruction:\n ");
714 : LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
715 : LLVM_DEBUG(dbgs() << "\n");
716 :
717 : // Erase the old instructions.
718 0 : I->eraseFromParent();
719 0 : MergeMI->eraseFromParent();
720 0 : return NextI;
721 : }
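// Illustrative example (hypothetical operands) of the zero-store merge above:
//   strh wzr, [x0, #4]
//   strh wzr, [x0, #6]
// becomes the single wider store
//   str  wzr, [x0, #4]
// The scaled offset is halved because the wide opcode doubles the access
// size, so the byte address is unchanged.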
722 :
723 : MachineBasicBlock::iterator
724 0 : AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
725 : MachineBasicBlock::iterator Paired,
726 : const LdStPairFlags &Flags) {
727 0 : MachineBasicBlock::iterator NextI = I;
728 : ++NextI;
729 : // If NextI is the second of the two instructions to be merged, we need
730 : // to skip one further. Either way, the merge will invalidate the iterator,
731 : // and we don't need to scan the new instruction, as it's a pairwise
732 : // instruction, which we're not considering for further action anyway.
733 0 : if (NextI == Paired)
734 : ++NextI;
735 :
736 0 : int SExtIdx = Flags.getSExtIdx();
737 : unsigned Opc =
738 0 : SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
739 0 : bool IsUnscaled = TII->isUnscaledLdSt(Opc);
740 0 : int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
741 :
742 0 : bool MergeForward = Flags.getMergeForward();
743 : // Insert our new paired instruction after whichever of the paired
744 : // instructions MergeForward indicates.
745 0 : MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
746 : // MergeForward also determines which instruction we copy the base register
747 : // operand from, so that the flags stay compatible with the input code.
748 : const MachineOperand &BaseRegOp =
749 0 : MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
750 :
751 0 : int Offset = getLdStOffsetOp(*I).getImm();
752 0 : int PairedOffset = getLdStOffsetOp(*Paired).getImm();
753 0 : bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
754 0 : if (IsUnscaled != PairedIsUnscaled) {
755 : // We're trying to pair instructions that differ in how they are scaled. If
756 : // I is scaled then scale the offset of Paired accordingly. Otherwise, do
757 : // the opposite (i.e., make Paired's offset unscaled).
758 0 : int MemSize = getMemScale(*Paired);
759 0 : if (PairedIsUnscaled) {
760 : // If the unscaled offset isn't a multiple of the MemSize, we can't
761 : // pair the operations together.
762 : assert(!(PairedOffset % getMemScale(*Paired)) &&
763 : "Offset should be a multiple of the stride!");
764 0 : PairedOffset /= MemSize;
765 : } else {
766 0 : PairedOffset *= MemSize;
767 : }
768 : }
769 :
770 : // Which register is Rt and which is Rt2 depends on the offset order.
771 : MachineInstr *RtMI, *Rt2MI;
772 0 : if (Offset == PairedOffset + OffsetStride) {
773 : RtMI = &*Paired;
774 : Rt2MI = &*I;
775 : // Here we swapped the assumption made for SExtIdx.
776 : // I.e., we turn ldp I, Paired into ldp Paired, I.
777 : // Update the index accordingly.
778 0 : if (SExtIdx != -1)
779 0 : SExtIdx = (SExtIdx + 1) % 2;
780 : } else {
781 : RtMI = &*I;
782 : Rt2MI = &*Paired;
783 : }
784 0 : int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
785 : // Scale the immediate offset, if necessary.
786 0 : if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
787 : assert(!(OffsetImm % getMemScale(*RtMI)) &&
788 : "Unscaled offset cannot be scaled.");
789 0 : OffsetImm /= getMemScale(*RtMI);
790 : }
791 :
792 : // Construct the new instruction.
793 : MachineInstrBuilder MIB;
794 : DebugLoc DL = I->getDebugLoc();
795 0 : MachineBasicBlock *MBB = I->getParent();
796 0 : MachineOperand RegOp0 = getLdStRegOp(*RtMI);
797 0 : MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
798 : // Kill flags may become invalid when moving stores for pairing.
799 0 : if (RegOp0.isUse()) {
800 0 : if (!MergeForward) {
801 : // Clear kill flags on store if moving upwards. Example:
802 : // STRWui %w0, ...
803 : // USE %w1
804 : // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
805 : RegOp0.setIsKill(false);
806 : RegOp1.setIsKill(false);
807 : } else {
808 : // Clear kill flags of the first stores register. Example:
809 : // STRWui %w1, ...
810 : // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
811 : // STRW %w0
812 0 : unsigned Reg = getLdStRegOp(*I).getReg();
813 0 : for (MachineInstr &MI : make_range(std::next(I), Paired))
814 0 : MI.clearRegisterKills(Reg, TRI);
815 : }
816 : }
817 0 : MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
818 : .add(RegOp0)
819 : .add(RegOp1)
820 : .add(BaseRegOp)
821 0 : .addImm(OffsetImm)
822 0 : .cloneMergedMemRefs({&*I, &*Paired})
823 0 : .setMIFlags(I->mergeFlagsWith(*Paired));
824 :
825 : (void)MIB;
826 :
827 : LLVM_DEBUG(
828 : dbgs() << "Creating pair load/store. Replacing instructions:\n ");
829 : LLVM_DEBUG(I->print(dbgs()));
830 : LLVM_DEBUG(dbgs() << " ");
831 : LLVM_DEBUG(Paired->print(dbgs()));
832 : LLVM_DEBUG(dbgs() << " with instruction:\n ");
833 0 : if (SExtIdx != -1) {
834 : // Generate the sign extension for the proper result of the ldp.
835 : // I.e., with X1, that would be:
836 : // %w1 = KILL %w1, implicit-def %x1
837 : // %x1 = SBFMXri killed %x1, 0, 31
838 0 : MachineOperand &DstMO = MIB->getOperand(SExtIdx);
839 : // Right now, DstMO has the extended register, since it comes from an
840 : // extended opcode.
841 0 : unsigned DstRegX = DstMO.getReg();
842 : // Get the W variant of that register.
843 0 : unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
844 : // Update the result of LDP to use the W instead of the X variant.
845 0 : DstMO.setReg(DstRegW);
846 : LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
847 : LLVM_DEBUG(dbgs() << "\n");
848 : // Make the machine verifier happy by providing a definition for
849 : // the X register.
850 : // Insert this definition right after the generated LDP, i.e., before
851 : // InsertionPoint.
852 : MachineInstrBuilder MIBKill =
853 0 : BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
854 0 : .addReg(DstRegW)
855 0 : .addReg(DstRegX, RegState::Define);
856 0 : MIBKill->getOperand(2).setImplicit();
857 : // Create the sign extension.
858 : MachineInstrBuilder MIBSXTW =
859 0 : BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
860 0 : .addReg(DstRegX)
861 : .addImm(0)
862 : .addImm(31);
863 : (void)MIBSXTW;
864 : LLVM_DEBUG(dbgs() << " Extend operand:\n ");
865 : LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
866 : } else {
867 : LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
868 : }
869 : LLVM_DEBUG(dbgs() << "\n");
870 :
871 : // Erase the old instructions.
872 0 : I->eraseFromParent();
873 0 : Paired->eraseFromParent();
874 :
875 0 : return NextI;
876 : }
877 :
878 : MachineBasicBlock::iterator
879 43 : AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
880 : MachineBasicBlock::iterator StoreI) {
881 : MachineBasicBlock::iterator NextI = LoadI;
882 : ++NextI;
883 :
884 43 : int LoadSize = getMemScale(*LoadI);
885 43 : int StoreSize = getMemScale(*StoreI);
886 43 : unsigned LdRt = getLdStRegOp(*LoadI).getReg();
887 : const MachineOperand &StMO = getLdStRegOp(*StoreI);
888 43 : unsigned StRt = getLdStRegOp(*StoreI).getReg();
889 86 : bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
890 :
891 : assert((IsStoreXReg ||
892 : TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
893 : "Unexpected RegClass");
894 :
895 : MachineInstr *BitExtMI;
896 43 : if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
897 : // Remove the load if its destination register is the same register that
898 : // holds the stored value.
899 6 : if (StRt == LdRt && LoadSize == 8) {
900 : for (MachineInstr &MI : make_range(StoreI->getIterator(),
901 3 : LoadI->getIterator())) {
902 3 : if (MI.killsRegister(StRt, TRI)) {
903 2 : MI.clearRegisterKills(StRt, TRI);
904 2 : break;
905 : }
906 : }
907 : LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
908 : LLVM_DEBUG(LoadI->print(dbgs()));
909 : LLVM_DEBUG(dbgs() << "\n");
910 2 : LoadI->eraseFromParent();
911 2 : return NextI;
912 : }
913 : // Replace the load with a mov if the load and store have the same size.
914 : BitExtMI =
915 8 : BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
916 12 : TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
917 8 : .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
918 : .add(StMO)
919 : .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
920 8 : .setMIFlags(LoadI->getFlags());
921 : } else {
922 : // FIXME: Currently we disable this transformation in big-endian targets as
923 : // performance and correctness are verified only in little-endian.
924 37 : if (!Subtarget->isLittleEndian())
925 0 : return NextI;
926 : bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
927 : assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
928 : "Unsupported ld/st match");
929 : assert(LoadSize <= StoreSize && "Invalid load size");
930 : int UnscaledLdOffset = IsUnscaled
931 16 : ? getLdStOffsetOp(*LoadI).getImm()
932 74 : : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
933 : int UnscaledStOffset = IsUnscaled
934 16 : ? getLdStOffsetOp(*StoreI).getImm()
935 74 : : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
936 37 : int Width = LoadSize * 8;
937 37 : int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
938 37 : int Imms = Immr + Width - 1;
939 : unsigned DestReg = IsStoreXReg
940 37 : ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
941 : &AArch64::GPR64RegClass)
942 : : LdRt;
943 :
944 : assert((UnscaledLdOffset >= UnscaledStOffset &&
945 : (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
946 : "Invalid offset");
947 :
948 : Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
949 : Imms = Immr + Width - 1;
950 37 : if (UnscaledLdOffset == UnscaledStOffset) {
951 2 : uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
952 2 : | ((Immr) << 6) // immr
953 2 : | ((Imms) << 0) // imms
954 : ;
955 :
956 : BitExtMI =
957 2 : BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
958 2 : TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
959 6 : DestReg)
960 : .add(StMO)
961 2 : .addImm(AndMaskEncoded)
962 2 : .setMIFlags(LoadI->getFlags());
963 : } else {
964 : BitExtMI =
965 35 : BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
966 35 : TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
967 61 : DestReg)
968 : .add(StMO)
969 35 : .addImm(Immr)
970 35 : .addImm(Imms)
971 35 : .setMIFlags(LoadI->getFlags());
972 : }
973 : }
974 :
975 : // Clear kill flags between store and load.
976 : for (MachineInstr &MI : make_range(StoreI->getIterator(),
977 48 : BitExtMI->getIterator()))
978 43 : if (MI.killsRegister(StRt, TRI)) {
979 36 : MI.clearRegisterKills(StRt, TRI);
980 36 : break;
981 : }
982 :
983 : LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
984 : LLVM_DEBUG(StoreI->print(dbgs()));
985 : LLVM_DEBUG(dbgs() << " ");
986 : LLVM_DEBUG(LoadI->print(dbgs()));
987 : LLVM_DEBUG(dbgs() << " with instructions:\n ");
988 : LLVM_DEBUG(StoreI->print(dbgs()));
989 : LLVM_DEBUG(dbgs() << " ");
990 : LLVM_DEBUG((BitExtMI)->print(dbgs()));
991 : LLVM_DEBUG(dbgs() << "\n");
992 :
993 : // Erase the old instructions.
994 41 : LoadI->eraseFromParent();
995 41 : return NextI;
996 : }
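// Worked example (hypothetical operands, little-endian as required above) of
// the narrower-load promotion:
//   str  w0, [sp, #4]
//   ldrb w1, [sp, #5]
// gives LoadSize = 1, StoreSize = 4, Width = 8, Immr = 8 * (5 - 4) = 8,
// Imms = 8 + 8 - 1 = 15, so the load is replaced by
//   ubfm w1, w0, #8, #15   // i.e. ubfx w1, w0, #8, #8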
997 :
998 : static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
999 : // Convert the byte-offset used by unscaled instructions into an "element"
1000 : // offset used by the scaled pair load/store instructions.
1001 8193 : if (IsUnscaled) {
1002 : // If the byte-offset isn't a multiple of the stride, there's no point
1003 : // trying to match it.
1004 303 : if (Offset % OffsetStride)
1005 : return false;
1006 217 : Offset /= OffsetStride;
1007 : }
1008 8107 : return Offset <= 63 && Offset >= -64;
1009 : }
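// The [-64, 63] bound reflects the 7-bit signed, scaled immediate of the
// pair-wise instructions; e.g. for 8-byte accesses the largest mergeable byte
// offset is 63 * 8 = 504 (an unscaled byte offset is divided by the stride
// before the check).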
1010 :
1011 : // Do alignment, specialized to power of 2 and for signed ints,
1012 : // avoiding having to do a C-style cast from uint64_t to int when
1013 : // using alignTo from include/llvm/Support/MathExtras.h.
1014 : // FIXME: Move this function to include/MathExtras.h?
1015 : static int alignTo(int Num, int PowOf2) {
1016 81 : return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1017 : }
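// For example, alignTo(5, 4) == (5 + 3) & ~3 == 8, while an already aligned
// value such as alignTo(8, 4) is returned unchanged.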
1018 :
1019 3421 : static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
1020 : AliasAnalysis *AA) {
1021 : // One of the instructions must modify memory.
1022 3421 : if (!MIa.mayStore() && !MIb.mayStore())
1023 : return false;
1024 :
1025 : // Both instructions must be memory operations.
1026 3370 : if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
1027 : return false;
1028 :
1029 3370 : return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
1030 : }
1031 :
1032 968 : static bool mayAlias(MachineInstr &MIa,
1033 : SmallVectorImpl<MachineInstr *> &MemInsns,
1034 : AliasAnalysis *AA) {
1035 1132 : for (MachineInstr *MIb : MemInsns)
1036 191 : if (mayAlias(MIa, *MIb, AA))
1037 : return true;
1038 :
1039 : return false;
1040 : }
1041 :
1042 0 : bool AArch64LoadStoreOpt::findMatchingStore(
1043 : MachineBasicBlock::iterator I, unsigned Limit,
1044 : MachineBasicBlock::iterator &StoreI) {
1045 0 : MachineBasicBlock::iterator B = I->getParent()->begin();
1046 0 : MachineBasicBlock::iterator MBBI = I;
1047 : MachineInstr &LoadMI = *I;
1048 0 : unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
1049 :
1050 : // If the load is the first instruction in the block, there's obviously
1051 : // not any matching store.
1052 0 : if (MBBI == B)
1053 0 : return false;
1054 :
1055 : // Track which register units have been modified and used between the first
1056 : // insn and the second insn.
1057 : ModifiedRegUnits.clear();
1058 : UsedRegUnits.clear();
1059 :
1060 : unsigned Count = 0;
1061 : do {
1062 : --MBBI;
1063 : MachineInstr &MI = *MBBI;
1064 :
1065 : // Don't count transient instructions towards the search limit since there
1066 : // may be different numbers of them if e.g. debug information is present.
1067 : if (!MI.isTransient())
1068 0 : ++Count;
1069 :
1070 : // If the load instruction reads directly from the address to which the
1071 : // store instruction writes and the stored value is not modified, we can
1072 : // promote the load. Since we do not handle stores with pre-/post-index,
1073 : // it's unnecessary to check if BaseReg is modified by the store itself.
1074 0 : if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1075 0 : BaseReg == getLdStBaseOp(MI).getReg() &&
1076 0 : isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1077 0 : ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1078 0 : StoreI = MBBI;
1079 0 : return true;
1080 : }
1081 :
1082 0 : if (MI.isCall())
1083 0 : return false;
1084 :
1085 : // Update modified / uses register units.
1086 0 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1087 :
1088 : // Otherwise, if the base register is modified, we have no match, so
1089 : // return early.
1090 0 : if (!ModifiedRegUnits.available(BaseReg))
1091 0 : return false;
1092 :
1093 : // If we encounter a store aliased with the load, return early.
1094 0 : if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
1095 0 : return false;
1096 0 : } while (MBBI != B && Count < Limit);
1097 : return false;
1098 : }
1099 :
1100 : // Returns true if FirstMI and MI are candidates for merging or pairing.
1101 : // Otherwise, returns false.
1102 0 : static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1103 : LdStPairFlags &Flags,
1104 : const AArch64InstrInfo *TII) {
1105 : // If this is volatile or if pairing is suppressed, not a candidate.
1106 0 : if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1107 0 : return false;
1108 :
1109 : // We should have already checked FirstMI for pair suppression and volatility.
1110 : assert(!FirstMI.hasOrderedMemoryRef() &&
1111 : !TII->isLdStPairSuppressed(FirstMI) &&
1112 : "FirstMI shouldn't get here if either of these checks are true.");
1113 :
1114 0 : unsigned OpcA = FirstMI.getOpcode();
1115 0 : unsigned OpcB = MI.getOpcode();
1116 :
1117 : // Opcodes match: nothing more to check.
1118 0 : if (OpcA == OpcB)
1119 0 : return true;
1120 :
1121 : // Try to match a sign-extended load/store with a zero-extended load/store.
1122 : bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1123 0 : unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1124 : assert(IsValidLdStrOpc &&
1125 : "Given Opc should be a Load or Store with an immediate");
1126 : // OpcA will be the first instruction in the pair.
1127 0 : if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1128 0 : Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1129 0 : return true;
1130 : }
1131 :
1132 : // If the second instruction isn't even a mergable/pairable load/store, bail
1133 : // out.
1134 0 : if (!PairIsValidLdStrOpc)
1135 0 : return false;
1136 :
1137 : // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1138 : // offsets.
1139 : if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1140 0 : return false;
1141 :
1142 : // Try to match an unscaled load/store with a scaled load/store.
1143 0 : return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
1144 0 : getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1145 :
1146 : // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1147 : }
1148 :
1149 : /// Scan the instructions looking for a load/store that can be combined with the
1150 : /// current instruction into a wider equivalent or a load/store pair.
1151 : MachineBasicBlock::iterator
1152 6601 : AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1153 : LdStPairFlags &Flags, unsigned Limit,
1154 : bool FindNarrowMerge) {
1155 6601 : MachineBasicBlock::iterator E = I->getParent()->end();
1156 : MachineBasicBlock::iterator MBBI = I;
1157 : MachineInstr &FirstMI = *I;
1158 : ++MBBI;
1159 :
1160 6601 : bool MayLoad = FirstMI.mayLoad();
1161 : bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
1162 6601 : unsigned Reg = getLdStRegOp(FirstMI).getReg();
1163 6601 : unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1164 6601 : int Offset = getLdStOffsetOp(FirstMI).getImm();
1165 6601 : int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
1166 : bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1167 :
1168 : // Track which register units have been modified and used between the first
1169 : // insn (inclusive) and the second insn.
1170 : ModifiedRegUnits.clear();
1171 : UsedRegUnits.clear();
1172 :
1173 : // Remember any instructions that read/write memory between FirstMI and MI.
1174 : SmallVector<MachineInstr *, 4> MemInsns;
1175 :
1176 29977 : for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1177 : MachineInstr &MI = *MBBI;
1178 :
1179 : // Don't count transient instructions towards the search limit since there
1180 : // may be different numbers of them if e.g. debug information is present.
1181 : if (!MI.isTransient())
1182 25346 : ++Count;
1183 :
1184 : Flags.setSExtIdx(-1);
1185 32128 : if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1186 : getLdStOffsetOp(MI).isImm()) {
1187 : assert(MI.mayLoadOrStore() && "Expected memory operation.");
1188 : // If we've found another instruction with the same opcode, check to see
1189 : // if the base and offset are compatible with our starting instruction.
1190 : // These instructions all have scaled immediate operands, so we just
1191 : // check for +1/-1. Make sure to check the new instruction offset is
1192 : // actually an immediate and not a symbolic reference destined for
1193 : // a relocation.
1194 5701 : unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
1195 5701 : int MIOffset = getLdStOffsetOp(MI).getImm();
1196 : bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
1197 5701 : if (IsUnscaled != MIIsUnscaled) {
1198 : // We're trying to pair instructions that differ in how they are scaled.
1199 : // If FirstMI is scaled then scale the offset of MI accordingly.
1200 : // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1201 53 : int MemSize = getMemScale(MI);
1202 53 : if (MIIsUnscaled) {
1203 : // If the unscaled offset isn't a multiple of the MemSize, we can't
1204 : // pair the operations together: bail and keep looking.
1205 12 : if (MIOffset % MemSize) {
1206 6 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1207 3 : UsedRegUnits, TRI);
1208 3 : MemInsns.push_back(&MI);
1209 3 : continue;
1210 : }
1211 9 : MIOffset /= MemSize;
1212 : } else {
1213 41 : MIOffset *= MemSize;
1214 : }
1215 : }
1216 :
1217 5698 : if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
1218 4123 : (Offset + OffsetStride == MIOffset))) {
1219 1585 : int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1220 1585 : if (FindNarrowMerge) {
1221 : // If the alignment requirements of the scaled wide load/store
1222 : // instruction can't express the offset of the scaled narrow input,
1223 : // bail and keep looking. For promotable zero stores, allow only when
1224 : // the stored value is the same (i.e., WZR).
1225 22 : if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1226 21 : (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1227 14 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1228 7 : UsedRegUnits, TRI);
1229 7 : MemInsns.push_back(&MI);
1230 7 : continue;
1231 : }
1232 : } else {
1233 : // Pairwise instructions have a 7-bit signed offset field. Single
1234 : // insns have a 12-bit unsigned offset field. If the resultant
1235 : // immediate offset of merging these instructions is out of range for
1236 : // a pairwise instruction, bail and keep looking.
1237 1563 : if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1238 0 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1239 0 : UsedRegUnits, TRI);
1240 0 : MemInsns.push_back(&MI);
1241 0 : continue;
1242 : }
1243 : // If the alignment requirements of the paired (scaled) instruction
1244 : // can't express the offset of the unscaled input, bail and keep
1245 : // looking.
1246 1563 : if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1247 0 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1248 0 : UsedRegUnits, TRI);
1249 0 : MemInsns.push_back(&MI);
1250 0 : continue;
1251 : }
1252 : }
1253 : // If the destination register of the loads is the same register, bail
1254 : // and keep looking. A load-pair instruction with both destination
1255 : // registers the same is UNPREDICTABLE and will result in an exception.
1256 1578 : if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
1257 212 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
1258 : TRI);
1259 212 : MemInsns.push_back(&MI);
1260 212 : continue;
1261 : }
1262 :
1263 : // If the Rt of the second instruction was not modified or used between
1264 : // the two instructions and none of the instructions between the second
1265 : // and first alias with the second, we can combine the second into the
1266 : // first.
1267 2339 : if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1268 1478 : !(MI.mayLoad() &&
1269 2788 : !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1270 917 : !mayAlias(MI, MemInsns, AA)) {
1271 : Flags.setMergeForward(false);
1272 902 : return MBBI;
1273 : }
1274 :
1275 : // Likewise, if the Rt of the first instruction is not modified or used
1276 : // between the two instructions and none of the instructions between the
1277 : // first and the second alias with the first, we can combine the first
1278 : // into the second.
1279 565 : if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
1280 58 : !(MayLoad &&
1281 573 : !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1282 51 : !mayAlias(FirstMI, MemInsns, AA)) {
1283 : Flags.setMergeForward(true);
1284 39 : return MBBI;
1285 : }
1286 : // Unable to combine these instructions due to interference in between.
1287 : // Keep looking.
1288 : }
1289 : }
1290 :
1291 : // The instruction wasn't a matching load or store. Stop searching if we
1292 : // encounter a call instruction that might modify memory.
1293 25253 : if (MI.isCall())
1294 1040 : return E;
1295 :
1296 : // Update modified / uses register units.
1297 24213 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1298 :
1299 : // Otherwise, if the base register is modified, we have no match, so
1300 : // return early.
1301 24213 : if (!ModifiedRegUnits.available(BaseReg))
1302 1059 : return E;
1303 :
1304 : // Update list of instructions that read/write memory.
1305 23154 : if (MI.mayLoadOrStore())
1306 12506 : MemInsns.push_back(&MI);
1307 : }
1308 3561 : return E;
1309 : }
1310 :
1311 : MachineBasicBlock::iterator
1312 199 : AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1313 : MachineBasicBlock::iterator Update,
1314 : bool IsPreIdx) {
1315 : assert((Update->getOpcode() == AArch64::ADDXri ||
1316 : Update->getOpcode() == AArch64::SUBXri) &&
1317 : "Unexpected base register update instruction to merge!");
1318 : MachineBasicBlock::iterator NextI = I;
1319 : // Return the instruction following the merged instruction, which is
1320 : // the instruction following our unmerged load. Unless that's the add/sub
1321 : // instruction we're merging, in which case it's the one after that.
1322 199 : if (++NextI == Update)
1323 : ++NextI;
1324 :
1325 199 : int Value = Update->getOperand(2).getImm();
1326 : assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1327 : "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1328 398 : if (Update->getOpcode() == AArch64::SUBXri)
1329 51 : Value = -Value;
1330 :
1331 199 : unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1332 218 : : getPostIndexedOpcode(I->getOpcode());
1333 : MachineInstrBuilder MIB;
1334 199 : if (!isPairedLdSt(*I)) {
1335 : // Non-paired instruction.
1336 328 : MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1337 : .add(getLdStRegOp(*Update))
1338 : .add(getLdStRegOp(*I))
1339 : .add(getLdStBaseOp(*I))
1340 164 : .addImm(Value)
1341 164 : .setMemRefs(I->memoperands())
1342 164 : .setMIFlags(I->mergeFlagsWith(*Update));
1343 : } else {
1344 : // Paired instruction.
1345 35 : int Scale = getMemScale(*I);
1346 70 : MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1347 : .add(getLdStRegOp(*Update))
1348 : .add(getLdStRegOp(*I, 0))
1349 : .add(getLdStRegOp(*I, 1))
1350 : .add(getLdStBaseOp(*I))
1351 35 : .addImm(Value / Scale)
1352 35 : .setMemRefs(I->memoperands())
1353 35 : .setMIFlags(I->mergeFlagsWith(*Update));
1354 : }
1355 : (void)MIB;
1356 :
1357 : if (IsPreIdx) {
1358 : ++NumPreFolded;
1359 : LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1360 : } else {
1361 : ++NumPostFolded;
1362 : LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1363 : }
1364 : LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
1365 : LLVM_DEBUG(I->print(dbgs()));
1366 : LLVM_DEBUG(dbgs() << " ");
1367 : LLVM_DEBUG(Update->print(dbgs()));
1368 : LLVM_DEBUG(dbgs() << " with instruction:\n ");
1369 : LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1370 : LLVM_DEBUG(dbgs() << "\n");
1371 :
1372 : // Erase the old instructions for the block.
1373 199 : I->eraseFromParent();
1374 199 : Update->eraseFromParent();
1375 :
1376 199 : return NextI;
1377 : }
1378 :
1379 0 : bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1380 : MachineInstr &MI,
1381 : unsigned BaseReg, int Offset) {
1382 0 : switch (MI.getOpcode()) {
1383 : default:
1384 : break;
1385 0 : case AArch64::SUBXri:
1386 : case AArch64::ADDXri:
1387 : // Make sure it's a vanilla immediate operand, not a relocation or
1388 : // anything else we can't handle.
1389 0 : if (!MI.getOperand(2).isImm())
1390 : break;
1391 : // Watch out for 1 << 12 shifted value.
1392 0 : if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1393 : break;
1394 :
1395 : // The update instruction source and destination register must be the
1396 : // same as the load/store base register.
1397 0 : if (MI.getOperand(0).getReg() != BaseReg ||
1398 0 : MI.getOperand(1).getReg() != BaseReg)
1399 : break;
1400 :
1401 0 : bool IsPairedInsn = isPairedLdSt(MemMI);
1402 0 : int UpdateOffset = MI.getOperand(2).getImm();
1403 0 : if (MI.getOpcode() == AArch64::SUBXri)
1404 0 : UpdateOffset = -UpdateOffset;
1405 :
1406 : // For non-paired load/store instructions, the immediate must fit in a
1407 : // signed 9-bit integer.
1408 0 : if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1409 : break;
1410 :
1411 : // For paired load/store instructions, the immediate must be a multiple of
1412 : // the scaling factor. The scaled offset must also fit into a signed 7-bit
1413 : // integer.
1414 0 : if (IsPairedInsn) {
1415 0 : int Scale = getMemScale(MemMI);
1416 0 : if (UpdateOffset % Scale != 0)
1417 : break;
1418 :
1419 0 : int ScaledOffset = UpdateOffset / Scale;
1420 0 : if (ScaledOffset > 63 || ScaledOffset < -64)
1421 : break;
1422 : }
1423 :
1424 : // If we have a non-zero Offset, we check that it matches the amount
1425 : // we're adding to the register.
1426 0 : if (!Offset || Offset == UpdateOffset)
1427 0 : return true;
1428 : break;
1429 : }
1430 : return false;
1431 : }
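// Worked example (hypothetical operands) of the paired range check above: for
// an STPXi (scale 8) with base x1, an update "add x1, x1, #24" gives
// UpdateOffset = 24 and ScaledOffset = 24 / 8 = 3, which fits in [-64, 63];
// an update of #20 is rejected because 20 is not a multiple of the scale.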
1432 :
1433 23813 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1434 : MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1435 23813 : MachineBasicBlock::iterator E = I->getParent()->end();
1436 : MachineInstr &MemMI = *I;
1437 : MachineBasicBlock::iterator MBBI = I;
1438 :
1439 23813 : unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1440 23813 : int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1441 :
1442 : // Scan forward looking for post-index opportunities. Updating instructions
1443 : // can't be formed if the memory instruction doesn't have the offset we're
1444 : // looking for.
1445 23813 : if (MIUnscaledOffset != UnscaledOffset)
1446 7647 : return E;
1447 :
1448 : // If the base register overlaps a destination register, we can't
1449 : // merge the update.
1450 : bool IsPairedInsn = isPairedLdSt(MemMI);
1451 35251 : for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1452 19439 : unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1453 19439 : if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1454 354 : return E;
1455 : }
1456 :
1457 : // Track which register units have been modified and used between the first
1458 : // insn (inclusive) and the second insn.
1459 : ModifiedRegUnits.clear();
1460 : UsedRegUnits.clear();
1461 : ++MBBI;
1462 41374 : for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1463 : MachineInstr &MI = *MBBI;
1464 :
1465 : // Don't count transient instructions towards the search limit since there
1466 : // may be different numbers of them if e.g. debug information is present.
1467 : if (!MI.isTransient())
1468 32994 : ++Count;
1469 :
1470 : // If we found a match, return it.
1471 34933 : if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1472 146 : return MBBI;
1473 :
1474 : // Update the status of what the instruction clobbered and used.
1475 34787 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1476 :
1477 : // Otherwise, if the base register is used or modified, we have no match, so
1478 : // return early.
1479 67547 : if (!ModifiedRegUnits.available(BaseReg) ||
1480 32760 : !UsedRegUnits.available(BaseReg))
1481 9225 : return E;
1482 : }
1483 6441 : return E;
1484 : }
1485 :
1486 11749 : MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1487 : MachineBasicBlock::iterator I, unsigned Limit) {
1488 11749 : MachineBasicBlock::iterator B = I->getParent()->begin();
1489 : MachineBasicBlock::iterator E = I->getParent()->end();
1490 : MachineInstr &MemMI = *I;
1491 11749 : MachineBasicBlock::iterator MBBI = I;
1492 :
1493 11749 : unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1494 11749 : int Offset = getLdStOffsetOp(MemMI).getImm();
1495 :
1496 : // If the load/store is the first instruction in the block, there's obviously
1497 : // not any matching update. Ditto if the memory offset isn't zero.
1498 11749 : if (MBBI == B || Offset != 0)
1499 8790 : return E;
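     :   // Editorial illustration (annotation, not in the original source): with
     :   //   add x0, x0, #8
     :   //   ldr x1, [x0, #16]
     :   // the only pre-index form that loads the right address is
     :   // "ldr x1, [x0, #24]!", but that writes back 24 rather than the 8 the
     :   // add produces, so only zero-offset memory operations are candidates
     :   // for the backward search.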
1500 : // If the base register overlaps a destination register, we can't
1501 : // merge the update.
1502 : bool IsPairedInsn = isPairedLdSt(MemMI);
1503 6061 : for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1504 3226 : unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1505 3226 : if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1506 124 : return E;
1507 : }
1508 :
1509 : // Track which register units have been modified and used between the first
1510 : // insn (inclusive) and the second insn.
1511 : ModifiedRegUnits.clear();
1512 : UsedRegUnits.clear();
1513 : unsigned Count = 0;
1514 : do {
1515 : --MBBI;
1516 : MachineInstr &MI = *MBBI;
1517 :
1518 : // Don't count transient instructions towards the search limit since there
1519 : // may be different numbers of them if e.g. debug information is present.
1520 : if (!MI.isTransient())
1521 5456 : ++Count;
1522 :
1523 : // If we found a match, return it.
1524 6230 : if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
1525 53 : return MBBI;
1526 :
1527 : // Update the status of what the instruction clobbered and used.
1528 6177 : LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1529 :
1530 : // Otherwise, if the base register is used or modified, we have no match, so
1531 : // return early.
1532 11716 : if (!ModifiedRegUnits.available(BaseReg) ||
1533 5539 : !UsedRegUnits.available(BaseReg))
1534 1290 : return E;
1535 4887 : } while (MBBI != B && Count < Limit);
1536 1492 : return E;
1537 : }
1538 :
1539 3781 : bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1540 : MachineBasicBlock::iterator &MBBI) {
1541 : MachineInstr &MI = *MBBI;
1542 : // If this is a volatile load, don't mess with it.
1543 3781 : if (MI.hasOrderedMemoryRef())
1544 : return false;
1545 :
1546 : // Make sure this is a reg+imm.
1547 : // FIXME: It is possible to extend it to handle reg+reg cases.
1548 1935 : if (!getLdStOffsetOp(MI).isImm())
1549 : return false;
1550 :
1551 : // Look backward up to LdStLimit instructions.
1552 : MachineBasicBlock::iterator StoreI;
1553 1651 : if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
1554 : ++NumLoadsFromStoresPromoted;
1555 : // Promote the load. Keeping the iterator straight is a
1556 : // pain, so we let the merge routine tell us what the next instruction
1557 : // is after it's done mucking about.
1558 43 : MBBI = promoteLoadFromStore(MBBI, StoreI);
1559 43 : return true;
1560 : }
1561 : return false;
1562 : }
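     : // Editorial note (annotation, not in the original source): when the load
     : // and store have the same width, e.g.
     : //   str x1, [x0]
     : //   ldr x2, [x0]
     : // the promoted load is expected to become a plain register copy
     : // ("mov x2, x1"); the narrower-load case shown in optimizeBlock() below
     : // becomes a bitfield extract instead.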
1563 :
1564 : // Merge adjacent zero stores into a wider store.
1565 100 : bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
1566 : MachineBasicBlock::iterator &MBBI) {
1567 : assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
1568 : MachineInstr &MI = *MBBI;
1569 100 : MachineBasicBlock::iterator E = MI.getParent()->end();
1570 :
1571 100 : if (!TII->isCandidateToMergeOrPair(MI))
1572 : return false;
1573 :
1574 :   // Look ahead up to LdStLimit instructions for a mergeable instruction.
1575 91 : LdStPairFlags Flags;
1576 : MachineBasicBlock::iterator MergeMI =
1577 91 : findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
1578 91 : if (MergeMI != E) {
1579 : ++NumZeroStoresPromoted;
1580 :
1581 : // Keeping the iterator straight is a pain, so we let the merge routine tell
1582 : // us what the next instruction is after it's done mucking about.
1583 15 : MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
1584 15 : return true;
1585 : }
1586 : return false;
1587 : }
1588 :
1589 : // Find loads and stores that can be merged into a single load or store pair
1590 : // instruction.
1591 11334 : bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
1592 : MachineInstr &MI = *MBBI;
1593 11334 : MachineBasicBlock::iterator E = MI.getParent()->end();
1594 :
1595 11334 : if (!TII->isCandidateToMergeOrPair(MI))
1596 : return false;
1597 :
1598 :   // Early exit if the offset is not possible to match. (6 bits of positive
1599 :   // range, plus allow one extra stride in case we find a later insn that
1600 :   // pairs with this one at Offset minus one stride.)
1601 : bool IsUnscaled = TII->isUnscaledLdSt(MI);
1602 6630 : int Offset = getLdStOffsetOp(MI).getImm();
1603 6630 : int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
1604 : // Allow one more for offset.
1605 6630 : if (Offset > 0)
1606 2754 : Offset -= OffsetStride;
1607 6544 : if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1608 : return false;
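     :   // Editorial illustration (annotation, not in the original source):
     :   //   ldr x0, [x2, #512]   ; scaled immediate 64, one past the LDP limit
     :   // is not rejected here, because a later "ldr x1, [x2, #504]" would let
     :   // us form "ldp x1, x0, [x2, #504]" with an in-range immediate of 63.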
1609 :
1610 : // Look ahead up to LdStLimit instructions for a pairable instruction.
1611 6510 : LdStPairFlags Flags;
1612 : MachineBasicBlock::iterator Paired =
1613 6510 : findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
1614 6510 : if (Paired != E) {
1615 : ++NumPairCreated;
1616 : if (TII->isUnscaledLdSt(MI))
1617 : ++NumUnscaledPairCreated;
1618 : // Keeping the iterator straight is a pain, so we let the merge routine tell
1619 : // us what the next instruction is after it's done mucking about.
1620 926 : MBBI = mergePairedInsns(MBBI, Paired, Flags);
1621 926 : return true;
1622 : }
1623 : return false;
1624 : }
1625 :
1626 12117 : bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
1627 : (MachineBasicBlock::iterator &MBBI) {
1628 : MachineInstr &MI = *MBBI;
1629 12117 : MachineBasicBlock::iterator E = MI.getParent()->end();
1630 : MachineBasicBlock::iterator Update;
1631 :
1632 : // Look forward to try to form a post-index instruction. For example,
1633 : // ldr x0, [x20]
1634 : // add x20, x20, #32
1635 : // merged into:
1636 : // ldr x0, [x20], #32
1637 12117 : Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
1638 12117 : if (Update != E) {
1639 : // Merge the update into the ld/st.
1640 109 : MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1641 109 : return true;
1642 : }
1643 :
1644 : // Don't know how to handle unscaled pre/post-index versions below, so bail.
1645 24016 : if (TII->isUnscaledLdSt(MI.getOpcode()))
1646 : return false;
1647 :
1648 : // Look back to try to find a pre-index instruction. For example,
1649 : // add x0, x0, #8
1650 : // ldr x1, [x0]
1651 : // merged into:
1652 : // ldr x1, [x0, #8]!
1653 11749 : Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
1654 11749 : if (Update != E) {
1655 : // Merge the update into the ld/st.
1656 53 : MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1657 53 : return true;
1658 : }
1659 :
1660 : // The immediate in the load/store is scaled by the size of the memory
1661 : // operation. The immediate in the add we're looking for,
1662 : // however, is not, so adjust here.
1663 11696 : int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
1664 :
1665 :   // Look forward to try to find a pre-index instruction. For example,
1666 : // ldr x1, [x0, #64]
1667 : // add x0, x0, #64
1668 : // merged into:
1669 : // ldr x1, [x0, #64]!
1670 11696 : Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
1671 11696 : if (Update != E) {
1672 : // Merge the update into the ld/st.
1673 37 : MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1674 37 : return true;
1675 : }
1676 :
1677 : return false;
1678 : }
1679 :
1680 15841 : bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
1681 : bool EnableNarrowZeroStOpt) {
1682 : bool Modified = false;
1683 :   // Four transformations to do here:
1684 : // 1) Find loads that directly read from stores and promote them by
1685 : // replacing with mov instructions. If the store is wider than the load,
1686 : // the load will be replaced with a bitfield extract.
1687 : // e.g.,
1688 : // str w1, [x0, #4]
1689 : // ldrh w2, [x0, #6]
1690 : // ; becomes
1691 : // str w1, [x0, #4]
1692 : // lsr w2, w1, #16
1693 15841 : for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1694 91755 : MBBI != E;) {
1695 75914 : if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
1696 : Modified = true;
1697 : else
1698 : ++MBBI;
1699 : }
1700 : // 2) Merge adjacent zero stores into a wider store.
1701 : // e.g.,
1702 : // strh wzr, [x0]
1703 : // strh wzr, [x0, #2]
1704 : // ; becomes
1705 : // str wzr, [x0]
1706 : // e.g.,
1707 : // str wzr, [x0]
1708 : // str wzr, [x0, #4]
1709 : // ; becomes
1710 : // str xzr, [x0]
1711 15841 : if (EnableNarrowZeroStOpt)
1712 15656 : for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1713 90816 : MBBI != E;) {
1714 100 : if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
1715 : Modified = true;
1716 : else
1717 : ++MBBI;
1718 : }
1719 : // 3) Find loads and stores that can be merged into a single load or store
1720 : // pair instruction.
1721 : // e.g.,
1722 : // ldr x0, [x2]
1723 : // ldr x1, [x2, #8]
1724 : // ; becomes
1725 : // ldp x0, x1, [x2]
1726 15841 : for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1727 90851 : MBBI != E;) {
1728 75010 : if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
1729 : Modified = true;
1730 : else
1731 : ++MBBI;
1732 : }
1733 : // 4) Find base register updates that can be merged into the load or store
1734 : // as a base-reg writeback.
1735 : // e.g.,
1736 : // ldr x0, [x2]
1737 : // add x2, x2, #4
1738 : // ; becomes
1739 : // ldr x0, [x2], #4
1740 15841 : for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1741 90678 : MBBI != E;) {
1742 74837 : if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
1743 : Modified = true;
1744 : else
1745 : ++MBBI;
1746 : }
1747 :
1748 15841 : return Modified;
1749 : }
1750 :
1751 13745 : bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1752 13745 : if (skipFunction(Fn.getFunction()))
1753 : return false;
1754 :
1755 13739 : Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1756 13739 : TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1757 13739 : TRI = Subtarget->getRegisterInfo();
1758 13739 : AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1759 :
1760 : // Resize the modified and used register unit trackers. We do this once
1761 : // per function and then clear the register units each time we optimize a load
1762 : // or store.
1763 13739 : ModifiedRegUnits.init(*TRI);
1764 13739 : UsedRegUnits.init(*TRI);
1765 :
1766 : bool Modified = false;
1767 13739 : bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
1768 29580 : for (auto &MBB : Fn)
1769 15841 : Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
1770 :
1771 : return Modified;
1772 : }
1773 :
1774 : // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
1775 : // stores near one another? Note: The pre-RA instruction scheduler already has
1776 : // hooks to try to schedule pairable loads/stores together to improve pairing
1777 : // opportunities. Thus, a pre-RA pairing pass may not be worth the effort.
1778 :
1779 : // FIXME: When pairing store instructions it's very possible for this pass to
1780 : // hoist a store with a KILL marker above another use (without a KILL marker).
1781 : // The resulting IR is invalid, but nothing uses the KILL markers after this
1782 : // pass, so it's never caused a problem in practice.
1783 :
1784 : /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1785 : /// load / store optimization pass.
1786 1113 : FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1787 1113 : return new AArch64LoadStoreOpt();
1788 : }