LLVM 22.0.0git
X86AvoidStoreForwardingBlocks.cpp
//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data
// directly from the store to the load. This "store forwarding" saves cycles
// by letting the load obtain the data directly instead of fetching it from
// the cache or memory.
// A "store forward block" occurs when a store cannot be forwarded to the
// load. The most typical case on Intel Core microarchitectures is a small
// store that cannot be forwarded to a larger load.
// The estimated penalty for a store forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where a "store forward block"
// is created by the compiler when lowering memcpy calls to a sequence
// of a load and a store.
//
// The pass currently only handles cases where memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading to
// XMM/YMM, inserting the partial store into the register, and then storing
// back from XMM/YMM to memory, but this would result in a more conservative
// optimization since it requires proving that all memory accesses between the
// blocking store and the load either must alias or don't alias before the
// store can be moved, whereas the transformation done here is correct
// regardless of the other memory accesses.
//===----------------------------------------------------------------------===//
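//
// A minimal illustrative example (hypothetical source code, not taken from
// the compiler itself): given
//
//   void foo(char *A, char *B, int V) {
//     A[8] = char(V);      // small (1-byte) store into the copied range
//     memcpy(B, A, 16);    // lowered to one 16-byte XMM load and store
//   }
//
// the 1-byte store to A[8] cannot be forwarded to the 16-byte load of
// A[0..15], so the load stalls until the store retires. After this pass the
// 16-byte copy is split so that A[8] is reloaded with a 1-byte load and the
// remaining bytes with separate smaller loads; every load is then either
// fully covered by the small store or disjoint from it, and forwarding (or a
// normal cache hit) succeeds.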

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

#define DEBUG_TYPE "x86-avoid-sfb"

static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);
namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBImpl {
public:
  X86AvoidSFBImpl(AliasAnalysis *AA) : AA(AA) {}
  bool runOnMachineFunction(MachineFunction &MF);

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA = nullptr;

  /// Returns pairs of a load followed by a store to memory that look
  /// like a memcpy.
  void findPotentiallylBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each memory load that was blocked by a smaller store
  /// would now be copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size into smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

class X86AvoidSFBLegacy : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBLegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }
};

} // end anonymous namespace

char X86AvoidSFBLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBLegacy, DEBUG_TYPE, "Machine code sinking",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBLegacy, DEBUG_TYPE, "Machine code sinking",
                    false, false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocksLegacyPass() {
  return new X86AvoidSFBLegacy();
}

static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}

static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}

static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}

static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only a base register and an immediate
// displacement, or a frame index and an immediate displacement.
// TODO: Consider expanding to other addressing modes in the future.
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment =
      MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}

// Collect potentially blocking stores.
// Limit the number of instructions we inspect backwards, since the effect of
// a store block won't be visible if the store and load instructions have
// enough instructions in between to keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, try the predecessor blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock *PMBB : MBB->predecessors()) {
      int PredCount = 0;
      for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
        PotentialBlockers.push_back(&PBInst);
      }
    }
  }
  return PotentialBlockers;
}
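
// Illustrative sketch, not used by the pass: the single-basic-block part of
// the scan above, expressed over a simplified, hypothetical instruction
// record so the inspection limit and the early stop at calls are explicit.
// The SFBExampleInst type, its flags, the helper name, and the default limit
// are assumptions made only for this example.
#include <cstddef>
#include <vector>

namespace sfb_example {
struct SFBExampleInst {
  bool IsMeta = false; // debug/meta instruction, ignored by the scan
  bool IsCall = false; // calls conservatively terminate the scan
};

[[maybe_unused]] static std::vector<const SFBExampleInst *>
collectPotentialBlockers(const std::vector<SFBExampleInst> &Block,
                         std::size_t LoadIdx, unsigned InspectionLimit = 20) {
  std::vector<const SFBExampleInst *> Blockers;
  unsigned Seen = 0;
  // Walk backwards, starting at the instruction just before the load.
  for (std::size_t I = LoadIdx; I-- > 0;) {
    if (Block[I].IsMeta)
      continue;
    if (++Seen >= InspectionLimit)
      break; // far enough away that the potential stall is hidden anyway
    if (Block[I].IsCall)
      break; // stop at calls, as the pass does
    Blockers.push_back(&Block[I]);
  }
  return Blockers;
}
} // namespace sfb_example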

void X86AvoidSFBImpl::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 =
      MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the loadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}

void X86AvoidSFBImpl::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
                MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
                MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
                MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}
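
// Illustrative sketch, not used by the pass: the chunk sizes buildCopies
// chooses for a given copy size, in the same greedy order (16-byte chunks
// only for YMM-sized copies, then 8/4/2/1). For example, a 32-byte YMM copy
// becomes {16, 16} and a 15-byte remainder becomes {8, 4, 2, 1}. The helper
// name and the sfb_example namespace are hypothetical.
#include <vector>

namespace sfb_example {
[[maybe_unused]] static std::vector<int> splitCopySizes(int Size, bool IsYMM) {
  std::vector<int> Chunks;
  while (Size > 0) {
    if (IsYMM && Size >= MOV128SZ) {
      Chunks.push_back(MOV128SZ);
      Size -= MOV128SZ;
    } else if (Size >= MOV64SZ) {
      Chunks.push_back(MOV64SZ);
      Size -= MOV64SZ;
    } else if (Size >= MOV32SZ) {
      Chunks.push_back(MOV32SZ);
      Size -= MOV32SZ;
    } else if (Size >= MOV16SZ) {
      Chunks.push_back(MOV16SZ);
      Size -= MOV16SZ;
    } else {
      Chunks.push_back(MOV8SZ);
      Size -= MOV8SZ;
    }
  }
  return Chunks;
}
} // namespace sfb_example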

static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive,
    // then the partial copies were also created in a consecutive order to
    // reduce register pressure, and the location of the last load is before
    // the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}

bool X86AvoidSFBImpl::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;

  return !AA->isNoAlias(
      MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
      MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
}
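
// Illustrative worked example, not used by the pass: the lengths alias()
// passes to AA. With hypothetical memory operands at (Offset = 8, Size = 4)
// and (Offset = 0, Size = 16): MinOffset = 0, Overlapa = 4 + 8 - 0 = 12 and
// Overlapb = 16 + 0 - 0 = 16, so AA is asked whether the ranges [0, 12) and
// [0, 16) measured from the two base values can alias; in this example each
// queried range covers its operand's own access. The constexpr mirror below
// exists only so the numbers can be checked at compile time.
#include <algorithm>
#include <cstdint>

namespace sfb_example {
constexpr int64_t overlapLength(int64_t Size, int64_t Offset,
                                int64_t MinOffset) {
  return Size + Offset - MinOffset;
}
static_assert(overlapLength(4, 8, std::min<int64_t>(8, 0)) == 12,
              "first operand, extended back to the common start");
static_assert(overlapLength(16, 0, std::min<int64_t>(8, 0)) == 16,
              "second operand already starts at the common start");
} // namespace sfb_example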

void X86AvoidSFBImpl::findPotentiallylBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      Register DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (MachineOperand &StoreMO :
           llvm::make_early_inc_range(MRI->use_nodbg_operands(DefVR))) {
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI) &&
            MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}
unsigned X86AvoidSFBImpl::getRegSizeInBytes(MachineInstr *LoadInst) {
  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0);
  return TRI->getRegSizeInBits(*TRC) / 8;
}

void X86AvoidSFBImpl::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build a copy for the point until the current blocking store's
    // displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              LMMOffset);
}
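
// Illustrative sketch, not used by the pass: the load-side segments that
// breakBlockedCopies produces (each segment is then further split by
// buildCopies). For a hypothetical 32-byte YMM load at displacement 0 that is
// blocked by an 8-byte store at displacement 4 and a 4-byte store at
// displacement 20, the segments come out as {0,4}, {4,8}, {12,8}, {20,4},
// {24,8}: a copy always stops exactly at a blocking store so that store's
// bytes are reloaded with a load it can forward to. The helper name and the
// sfb_example namespace are hypothetical.
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

namespace sfb_example {
[[maybe_unused]] static std::vector<std::pair<int64_t, unsigned>>
splitAroundBlockingStores(int64_t LdDispImm, unsigned LoadRegSize,
                          const std::map<int64_t, unsigned> &BlockingDispSize) {
  std::vector<std::pair<int64_t, unsigned>> Segments;
  int64_t LdDisp1 = LdDispImm;
  for (const auto &[Disp, StSize] : BlockingDispSize) {
    int64_t LdDisp2 = Disp;
    unsigned Size2 = StSize;
    // Trim any part of the blocking store that was already copied.
    if (LdDisp2 < LdDisp1) {
      int64_t OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    unsigned Size1 = LdDisp2 - LdDisp1;
    if (Size1)
      Segments.emplace_back(LdDisp1, Size1); // bytes up to the blocking store
    Segments.emplace_back(LdDisp2, Size2);   // the blocking store itself
    LdDisp1 = LdDisp2 + Size2;
  }
  unsigned Size3 = (LdDispImm + LoadRegSize) - LdDisp1; // the remaining tail
  if (Size3)
    Segments.emplace_back(LdDisp1, Size3);
  return Segments;
}
} // namespace sfb_example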

static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}

static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}
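
// Illustrative check, not used by the pass: isBlockingStore holds exactly
// when the store's bytes fall entirely inside the loaded range. With a
// hypothetical 32-byte load at displacement 0, a 4-byte store at
// displacement 28 is blocking (28 <= 0 + 32 - 4), while the same store at
// displacement 29 is not, because it would extend past the loaded bytes.
// The constexpr mirror below exists only so the example can be checked at
// compile time; the pass uses the function above.
#include <cstdint>

namespace sfb_example {
constexpr bool isBlockingStoreExample(int64_t LoadDispImm, unsigned LoadSize,
                                      int64_t StoreDispImm,
                                      unsigned StoreSize) {
  return StoreDispImm >= LoadDispImm &&
         StoreDispImm <= LoadDispImm + (LoadSize - StoreSize);
}
static_assert(isBlockingStoreExample(0, 32, 28, 4),
              "store fully contained in the loaded range");
static_assert(!isBlockingStoreExample(0, 32, 29, 4),
              "store extends past the loaded range");
} // namespace sfb_example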

// Keep track of all the stores blocking a load.
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  auto [It, Inserted] = BlockingStoresDispSizeMap.try_emplace(DispImm, Size);
  // Choose the smallest blocking store starting at this displacement.
  if (!Inserted && It->second > Size)
    It->second = Size;
}

// Remove blocking stores contained in each other.
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}
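
// Illustrative sketch, not used by the pass: the containment filter applied
// above. Entries are visited in ascending displacement order; an earlier
// entry is dropped when a later entry's byte range is contained in it. A
// containing store can forward to any load that stays within it, so only the
// contained stores need dedicated small copies. For the hypothetical map
// {0:16, 4:4, 8:2} the entry {0:16} is dropped and {4:4}, {8:2} survive.
// The helper name and the sfb_example namespace are hypothetical.
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

namespace sfb_example {
[[maybe_unused]] static std::map<int64_t, unsigned>
dropContainingStores(const std::map<int64_t, unsigned> &DispSize) {
  std::vector<std::pair<int64_t, unsigned>> Stack;
  for (const auto &[Disp, Size] : DispSize) {
    // Pop earlier stores whose byte range contains the current one.
    while (!Stack.empty() &&
           Disp + Size <= Stack.back().first + Stack.back().second)
      Stack.pop_back();
    Stack.emplace_back(Disp, Size);
  }
  return {Stack.begin(), Stack.end()};
}
} // namespace sfb_example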

bool X86AvoidSFBImpl::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
  // Look for a load followed by a store to XMM/YMM that looks like a memcpy.
  findPotentiallylBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto *PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: take branch probability into consideration; if the blocking
      // store is in a rarely reached block, breaking the memcpy could lose
      // performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block: break the memcpy's load and store
    // into smaller copies such that each smaller store that was causing
    // the block is now copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
  }
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);

  return Changed;
}

bool X86AvoidSFBLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  X86AvoidSFBImpl Impl(AA);
  return Impl.runOnMachineFunction(MF);
}

PreservedAnalyses
X86AvoidStoreForwardingBlocksPass::run(MachineFunction &MF,
                                       MachineFunctionAnalysisManager &MFAM) {
  AliasAnalysis *AA =
      &MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
           .getManager()
           .getResult<AAManager>(MF.getFunction());
  X86AvoidSFBImpl Impl(AA);
  bool Changed = Impl.runOnMachineFunction(MF);
  return Changed ? getMachineFunctionPassPreservedAnalyses()
                 : PreservedAnalyses::all();
}