LLVM 20.0.0git
X86AvoidStoreForwardingBlocks.cpp
//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data directly
// from the store to the load. This "store forwarding" saves cycles by letting
// the load obtain the data directly instead of accessing it through the
// cache or memory.
// A "store forward block" occurs when a store cannot be forwarded to the load.
// The most typical case on Intel Core microarchitectures is a small store that
// cannot be forwarded to a larger load. The estimated penalty for a store
// forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where a "store forward block"
// is created by the compiler when lowering memcpy calls to a sequence
// of a load and a store.
//
// The pass currently only handles cases where the memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading to
// XMM/YMM, then inserting the partial store before storing back from XMM/YMM
// to memory, but this would result in a more conservative optimization since
// it requires proving that all memory accesses between the blocking store and
// the load must alias/don't alias before we can move the store, whereas the
// transformation done here is correct regardless of other memory accesses.
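//
// As an illustrative (hand-written, not compiler-generated) example, a
// 16-byte copy that reloads bytes just written by a narrow store can hit a
// store forward block:
//
//     movl    %esi, 4(%rdi)        # 4-byte store into the copied range
//     vmovups (%rdi), %xmm0        # 16-byte reload overlaps the small store,
//     vmovups %xmm0, 64(%rdi)      # so store-to-load forwarding fails
//
// For this layout the pass would instead copy the region in pieces (here
// 4 + 4 + 8 bytes) so that the load overlapping the pending store matches its
// size exactly and the remaining loads do not overlap it at all.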
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

#define DEBUG_TYPE "x86-avoid-SFB"

58 "x86-disable-avoid-SFB", cl::Hidden,
59 cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
60
62 "x86-sfb-inspection-limit",
63 cl::desc("X86: Number of instructions backward to "
64 "inspect for store forwarding blocks."),
65 cl::init(20), cl::Hidden);
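
// Both options are ordinary cl::opt flags, so (for example, when running the
// X86 backend through llc) they can be toggled on the command line:
//   llc -x86-disable-avoid-SFB foo.ll
//   llc -x86-sfb-inspection-limit=30 foo.ll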

namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA = nullptr;

  /// Collect pairs of a load followed by a store to memory which together
  /// look like a memcpy.
  void findPotentiallylBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each memory load that was blocked by a smaller store
  /// would now be copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size into smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace

char X86AvoidSFBPass::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking", false,
                    false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}

static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}

static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}
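
// For example, a narrow scalar store such as MOV32mr is treated as
// potentially blocking for any XMM or YMM load, while a 16-byte XMM store
// (e.g. VMOVUPSmr) is only considered blocking when the load is a 32-byte
// YMM load, since a store can never be forwarded to a load wider than itself.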

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}

static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}

static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only a base register and an immediate
// displacement, or a frame index and an immediate displacement.
// TODO: Consider expanding to other addressing modes in the future.
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}
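
// In assembly terms, this accepts forms like "movl %eax, 8(%rbp)" or a frame
// index plus an immediate offset, and rejects anything with a scaled index or
// a segment override, such as "movl %eax, (%rdi,%rcx,4)".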

// Collect potentially blocking stores.
// Limit the number of instructions we inspect backwards, since the effect of
// a store forward block won't be visible if the store and load have enough
// instructions in between to keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, try the predecessor blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first-order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock *PMBB : MBB->predecessors()) {
      int PredCount = 0;
      for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
        PotentialBlockers.push_back(&PBInst);
      }
    }
  }
  return PotentialBlockers;
}

void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the LoadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}
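
// As an illustrative sketch (not actual pass output), a single 4-byte piece
// of a copy with load displacement 4 and store displacement 68 would be
// emitted roughly as the MIR pair:
//   %r:gr32 = MOV32rm %base, 1, $noreg, 4, $noreg   ; 4-byte load
//   MOV32mr %base, 1, $noreg, 68, $noreg, %r        ; 4-byte store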

void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
                MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
                MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
                MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}
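
// For example, a 13-byte region (with an XMM-sized original load) is split
// greedily into an 8-byte, then a 4-byte, then a 1-byte copy.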

static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive,
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}

bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;

  return !AA->isNoAlias(
      MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
      MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
}

void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (MachineOperand &StoreMO :
           llvm::make_early_inc_range(MRI->use_nodbg_operands(DefVR))) {
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI) &&
            MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}

unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                                     *LoadInst->getParent()->getParent());
  return TRI->getRegSizeInBits(*TRC) / 8;
}

void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build a copy for the region up to the current blocking store's
    // displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              LMMOffset);
}
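
// Worked example (illustrative): for a 16-byte copy with load displacement 0,
// store displacement 64, and a single 4-byte blocking store recorded at
// displacement 4, the loop emits a copy of bytes [0,4), then of the blocked
// range [4,8), and the tail code emits the remaining [8,16); i.e. the original
// 16-byte copy becomes 4 + 4 + 8 byte copies.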

static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}

static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}
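
// In other words, the store must lie entirely inside the loaded range. For
// example, with LoadDispImm = 0 and LoadSize = 16, a 4-byte store at
// displacement 8 is blocking (8 <= 12), while one at displacement 14 is not
// (14 > 12), since it extends past the end of the load.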

// Keep track of all stores blocking a load.
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Choose the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;

  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}

// Remove blocking stores contained in each other.
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}
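
// For example, given entries {0: 16, 4: 4}, the 4-byte store at displacement
// 4 ends inside the 16-byte entry's range, so the containing 16-byte entry is
// popped and only {4: 4} is kept.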

bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
  // Look for a load followed by a store to XMM/YMM which look like a memcpy.
  findPotentiallylBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto *PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: take branch probability into consideration; if the blocking
      // store is in a rarely reached block, breaking the memcpy could lose
      // performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block; break the memcpy's load and store
    // into smaller copies such that each smaller store that was causing
    // a store block would now be copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
  }
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);

  return Changed;
}