LLVM 20.0.0git
X86AvoidStoreForwardingBlocks.cpp
Go to the documentation of this file.
1//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// If a load follows a store and reloads data that the store has written to
10// memory, Intel microarchitectures can in many cases forward the data directly
11// from the store to the load, This "store forwarding" saves cycles by enabling
12// the load to directly obtain the data instead of accessing the data from
13// cache or memory.
14// A "store forward block" occurs in cases that a store cannot be forwarded to
15// the load. The most typical case of store forward block on Intel Core
16// microarchitecture that a small store cannot be forwarded to a large load.
17// The estimated penalty for a store forward block is ~13 cycles.
18//
19// This pass tries to recognize and handle cases where "store forward block"
20// is created by the compiler when lowering memcpy calls to a sequence
21// of a load and a store.
22//
23// The pass currently only handles cases where memcpy is lowered to
24// XMM/YMM registers, it tries to break the memcpy into smaller copies.
25// breaking the memcpy should be possible since there is no atomicity
26// guarantee for loads and stores to XMM/YMM.
27//
28// It could be better for performance to solve the problem by loading
29// to XMM/YMM then inserting the partial store before storing back from XMM/YMM
30// to memory, but this will result in a more conservative optimization since it
31// requires we prove that all memory accesses between the blocking store and the
32// load must alias/don't alias before we can move the store, whereas the
33// transformation done here is correct regardless to other memory accesses.
34//===----------------------------------------------------------------------===//
35
36#include "X86.h"
37#include "X86InstrInfo.h"
38#include "X86Subtarget.h"
47#include "llvm/IR/DebugLoc.h"
48#include "llvm/IR/Function.h"
50#include "llvm/MC/MCInstrDesc.h"
51
52using namespace llvm;
53
54#define DEBUG_TYPE "x86-avoid-SFB"
55
57 "x86-disable-avoid-SFB", cl::Hidden,
58 cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
59
61 "x86-sfb-inspection-limit",
62 cl::desc("X86: Number of instructions backward to "
63 "inspect for store forwarding blocks."),
64 cl::init(20), cl::Hidden);
65
66namespace {
67
68using DisplacementSizeMap = std::map<int64_t, unsigned>;
69
70class X86AvoidSFBPass : public MachineFunctionPass {
71public:
72 static char ID;
73 X86AvoidSFBPass() : MachineFunctionPass(ID) { }
74
75 StringRef getPassName() const override {
76 return "X86 Avoid Store Forwarding Blocks";
77 }
78
79 bool runOnMachineFunction(MachineFunction &MF) override;
80
81 void getAnalysisUsage(AnalysisUsage &AU) const override {
84 }
85
86private:
87 MachineRegisterInfo *MRI = nullptr;
88 const X86InstrInfo *TII = nullptr;
89 const X86RegisterInfo *TRI = nullptr;
91 BlockedLoadsStoresPairs;
93 AliasAnalysis *AA = nullptr;
94
95 /// Returns couples of Load then Store to memory which look
96 /// like a memcpy.
97 void findPotentiallylBlockedCopies(MachineFunction &MF);
98 /// Break the memcpy's load and store into smaller copies
99 /// such that each memory load that was blocked by a smaller store
100 /// would now be copied separately.
101 void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
102 const DisplacementSizeMap &BlockingStoresDispSizeMap);
103 /// Break a copy of size Size to smaller copies.
104 void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
105 MachineInstr *StoreInst, int64_t StDispImm,
106 int64_t LMMOffset, int64_t SMMOffset);
107
108 void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
109 MachineInstr *StoreInst, unsigned NStoreOpcode,
110 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
111 int64_t SMMOffset);
112
113 bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
114
115 unsigned getRegSizeInBytes(MachineInstr *Inst);
116};
117
118} // end anonymous namespace
119
120char X86AvoidSFBPass::ID = 0;
121
122INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking",
123 false, false)
126 false)
127
129 return new X86AvoidSFBPass();
130}
131
132static bool isXMMLoadOpcode(unsigned Opcode) {
133 return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
134 Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
135 Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
136 Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
137 Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
138 Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
139 Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
140 Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
141}
142static bool isYMMLoadOpcode(unsigned Opcode) {
143 return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
144 Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
145 Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
146 Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
147 Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
148 Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
149 Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
150}
151
152static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
153 return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
154}
155
156static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
157 switch (LdOpcode) {
158 case X86::MOVUPSrm:
159 case X86::MOVAPSrm:
160 return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
161 case X86::VMOVUPSrm:
162 case X86::VMOVAPSrm:
163 return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
164 case X86::VMOVUPDrm:
165 case X86::VMOVAPDrm:
166 return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
167 case X86::VMOVDQUrm:
168 case X86::VMOVDQArm:
169 return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
170 case X86::VMOVUPSZ128rm:
171 case X86::VMOVAPSZ128rm:
172 return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
173 case X86::VMOVUPDZ128rm:
174 case X86::VMOVAPDZ128rm:
175 return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
176 case X86::VMOVUPSYrm:
177 case X86::VMOVAPSYrm:
178 return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
179 case X86::VMOVUPDYrm:
180 case X86::VMOVAPDYrm:
181 return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
182 case X86::VMOVDQUYrm:
183 case X86::VMOVDQAYrm:
184 return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
185 case X86::VMOVUPSZ256rm:
186 case X86::VMOVAPSZ256rm:
187 return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
188 case X86::VMOVUPDZ256rm:
189 case X86::VMOVAPDZ256rm:
190 return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
191 case X86::VMOVDQU64Z128rm:
192 case X86::VMOVDQA64Z128rm:
193 return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
194 case X86::VMOVDQU32Z128rm:
195 case X86::VMOVDQA32Z128rm:
196 return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
197 case X86::VMOVDQU64Z256rm:
198 case X86::VMOVDQA64Z256rm:
199 return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
200 case X86::VMOVDQU32Z256rm:
201 case X86::VMOVDQA32Z256rm:
202 return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
203 default:
204 return false;
205 }
206}
207
208static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
209 bool PBlock = false;
210 PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
211 Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
212 Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
213 Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
214 if (isYMMLoadOpcode(LoadOpcode))
215 PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
216 Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
217 Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
218 Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
219 Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
220 Opcode == X86::VMOVDQU64Z128mr ||
221 Opcode == X86::VMOVDQA64Z128mr ||
222 Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
223 return PBlock;
224}
225
226static const int MOV128SZ = 16;
227static const int MOV64SZ = 8;
228static const int MOV32SZ = 4;
229static const int MOV16SZ = 2;
230static const int MOV8SZ = 1;
231
232static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
233 switch (LoadOpcode) {
234 case X86::VMOVUPSYrm:
235 case X86::VMOVAPSYrm:
236 return X86::VMOVUPSrm;
237 case X86::VMOVUPDYrm:
238 case X86::VMOVAPDYrm:
239 return X86::VMOVUPDrm;
240 case X86::VMOVDQUYrm:
241 case X86::VMOVDQAYrm:
242 return X86::VMOVDQUrm;
243 case X86::VMOVUPSZ256rm:
244 case X86::VMOVAPSZ256rm:
245 return X86::VMOVUPSZ128rm;
246 case X86::VMOVUPDZ256rm:
247 case X86::VMOVAPDZ256rm:
248 return X86::VMOVUPDZ128rm;
249 case X86::VMOVDQU64Z256rm:
250 case X86::VMOVDQA64Z256rm:
251 return X86::VMOVDQU64Z128rm;
252 case X86::VMOVDQU32Z256rm:
253 case X86::VMOVDQA32Z256rm:
254 return X86::VMOVDQU32Z128rm;
255 default:
256 llvm_unreachable("Unexpected Load Instruction Opcode");
257 }
258 return 0;
259}
260
261static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
262 switch (StoreOpcode) {
263 case X86::VMOVUPSYmr:
264 case X86::VMOVAPSYmr:
265 return X86::VMOVUPSmr;
266 case X86::VMOVUPDYmr:
267 case X86::VMOVAPDYmr:
268 return X86::VMOVUPDmr;
269 case X86::VMOVDQUYmr:
270 case X86::VMOVDQAYmr:
271 return X86::VMOVDQUmr;
272 case X86::VMOVUPSZ256mr:
273 case X86::VMOVAPSZ256mr:
274 return X86::VMOVUPSZ128mr;
275 case X86::VMOVUPDZ256mr:
276 case X86::VMOVAPDZ256mr:
277 return X86::VMOVUPDZ128mr;
278 case X86::VMOVDQU64Z256mr:
279 case X86::VMOVDQA64Z256mr:
280 return X86::VMOVDQU64Z128mr;
281 case X86::VMOVDQU32Z256mr:
282 case X86::VMOVDQA32Z256mr:
283 return X86::VMOVDQU32Z128mr;
284 default:
285 llvm_unreachable("Unexpected Load Instruction Opcode");
286 }
287 return 0;
288}
289
290static int getAddrOffset(const MachineInstr *MI) {
291 const MCInstrDesc &Descl = MI->getDesc();
292 int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
293 assert(AddrOffset != -1 && "Expected Memory Operand");
294 AddrOffset += X86II::getOperandBias(Descl);
295 return AddrOffset;
296}
297
299 int AddrOffset = getAddrOffset(MI);
300 return MI->getOperand(AddrOffset + X86::AddrBaseReg);
301}
302
304 int AddrOffset = getAddrOffset(MI);
305 return MI->getOperand(AddrOffset + X86::AddrDisp);
306}
307
308// Relevant addressing modes contain only base register and immediate
309// displacement or frameindex and immediate displacement.
310// TODO: Consider expanding to other addressing modes in the future
312 int AddrOffset = getAddrOffset(MI);
314 const MachineOperand &Disp = getDispOperand(MI);
315 const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
316 const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
317 const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
318
319 if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
320 return false;
321 if (!Disp.isImm())
322 return false;
323 if (Scale.getImm() != 1)
324 return false;
325 if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
326 return false;
327 if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
328 return false;
329 return true;
330}
331
332// Collect potentially blocking stores.
333// Limit the number of instructions backwards we want to inspect
334// since the effect of store block won't be visible if the store
335// and load instructions have enough instructions in between to
336// keep the core busy.
339 SmallVector<MachineInstr *, 2> PotentialBlockers;
340 unsigned BlockCount = 0;
341 const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
342 for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
343 E = LoadInst->getParent()->rend();
344 PBInst != E; ++PBInst) {
345 if (PBInst->isMetaInstruction())
346 continue;
347 BlockCount++;
348 if (BlockCount >= InspectionLimit)
349 break;
350 MachineInstr &MI = *PBInst;
351 if (MI.getDesc().isCall())
352 return PotentialBlockers;
353 PotentialBlockers.push_back(&MI);
354 }
355 // If we didn't get to the instructions limit try predecessing blocks.
356 // Ideally we should traverse the predecessor blocks in depth with some
357 // coloring algorithm, but for now let's just look at the first order
358 // predecessors.
359 if (BlockCount < InspectionLimit) {
361 int LimitLeft = InspectionLimit - BlockCount;
362 for (MachineBasicBlock *PMBB : MBB->predecessors()) {
363 int PredCount = 0;
364 for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
365 if (PBInst.isMetaInstruction())
366 continue;
367 PredCount++;
368 if (PredCount >= LimitLeft)
369 break;
370 if (PBInst.getDesc().isCall())
371 break;
372 PotentialBlockers.push_back(&PBInst);
373 }
374 }
375 }
376 return PotentialBlockers;
377}
378
379void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
380 int64_t LoadDisp, MachineInstr *StoreInst,
381 unsigned NStoreOpcode, int64_t StoreDisp,
382 unsigned Size, int64_t LMMOffset,
383 int64_t SMMOffset) {
387 MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
388 MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
389
390 Register Reg1 = MRI->createVirtualRegister(
391 TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
392 MachineInstr *NewLoad =
393 BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
394 Reg1)
395 .add(LoadBase)
396 .addImm(1)
397 .addReg(X86::NoRegister)
398 .addImm(LoadDisp)
399 .addReg(X86::NoRegister)
401 MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
402 if (LoadBase.isReg())
403 getBaseOperand(NewLoad).setIsKill(false);
404 LLVM_DEBUG(NewLoad->dump());
405 // If the load and store are consecutive, use the loadInst location to
406 // reduce register pressure.
407 MachineInstr *StInst = StoreInst;
409 MBB->instr_begin());
410 if (PrevInstrIt.getNodePtr() == LoadInst)
411 StInst = LoadInst;
412 MachineInstr *NewStore =
413 BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
414 .add(StoreBase)
415 .addImm(1)
416 .addReg(X86::NoRegister)
417 .addImm(StoreDisp)
418 .addReg(X86::NoRegister)
419 .addReg(Reg1)
421 MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
422 if (StoreBase.isReg())
423 getBaseOperand(NewStore).setIsKill(false);
425 assert(StoreSrcVReg.isReg() && "Expected virtual register");
426 NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
427 LLVM_DEBUG(NewStore->dump());
428}
429
430void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
431 int64_t LdDispImm, MachineInstr *StoreInst,
432 int64_t StDispImm, int64_t LMMOffset,
433 int64_t SMMOffset) {
434 int LdDisp = LdDispImm;
435 int StDisp = StDispImm;
436 while (Size > 0) {
437 if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
438 Size = Size - MOV128SZ;
439 buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
441 StDisp, MOV128SZ, LMMOffset, SMMOffset);
442 LdDisp += MOV128SZ;
443 StDisp += MOV128SZ;
444 LMMOffset += MOV128SZ;
445 SMMOffset += MOV128SZ;
446 continue;
447 }
448 if (Size - MOV64SZ >= 0) {
449 Size = Size - MOV64SZ;
450 buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
451 MOV64SZ, LMMOffset, SMMOffset);
452 LdDisp += MOV64SZ;
453 StDisp += MOV64SZ;
454 LMMOffset += MOV64SZ;
455 SMMOffset += MOV64SZ;
456 continue;
457 }
458 if (Size - MOV32SZ >= 0) {
459 Size = Size - MOV32SZ;
460 buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
461 MOV32SZ, LMMOffset, SMMOffset);
462 LdDisp += MOV32SZ;
463 StDisp += MOV32SZ;
464 LMMOffset += MOV32SZ;
465 SMMOffset += MOV32SZ;
466 continue;
467 }
468 if (Size - MOV16SZ >= 0) {
469 Size = Size - MOV16SZ;
470 buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
471 MOV16SZ, LMMOffset, SMMOffset);
472 LdDisp += MOV16SZ;
473 StDisp += MOV16SZ;
474 LMMOffset += MOV16SZ;
475 SMMOffset += MOV16SZ;
476 continue;
477 }
478 if (Size - MOV8SZ >= 0) {
479 Size = Size - MOV8SZ;
480 buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
481 MOV8SZ, LMMOffset, SMMOffset);
482 LdDisp += MOV8SZ;
483 StDisp += MOV8SZ;
484 LMMOffset += MOV8SZ;
485 SMMOffset += MOV8SZ;
486 continue;
487 }
488 }
489 assert(Size == 0 && "Wrong size division");
490}
491
495 auto *StorePrevNonDbgInstr =
497 LoadInst->getParent()->instr_begin())
498 .getNodePtr();
499 if (LoadBase.isReg()) {
500 MachineInstr *LastLoad = LoadInst->getPrevNode();
501 // If the original load and store to xmm/ymm were consecutive
502 // then the partial copies were also created in
503 // a consecutive order to reduce register pressure,
504 // and the location of the last load is before the last store.
505 if (StorePrevNonDbgInstr == LoadInst)
506 LastLoad = LoadInst->getPrevNode()->getPrevNode();
507 getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
508 }
509 if (StoreBase.isReg()) {
510 MachineInstr *StInst = StoreInst;
511 if (StorePrevNonDbgInstr == LoadInst)
512 StInst = LoadInst;
513 getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
514 }
515}
516
517bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
518 const MachineMemOperand &Op2) const {
519 if (!Op1.getValue() || !Op2.getValue())
520 return true;
521
522 int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
523 int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
524 int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;
525
526 return !AA->isNoAlias(
527 MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
528 MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
529}
530
531void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
532 for (auto &MBB : MF)
533 for (auto &MI : MBB) {
534 if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
535 continue;
536 int DefVR = MI.getOperand(0).getReg();
537 if (!MRI->hasOneNonDBGUse(DefVR))
538 continue;
539 for (MachineOperand &StoreMO :
540 llvm::make_early_inc_range(MRI->use_nodbg_operands(DefVR))) {
541 MachineInstr &StoreMI = *StoreMO.getParent();
542 // Skip cases where the memcpy may overlap.
543 if (StoreMI.getParent() == MI.getParent() &&
544 isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
546 isRelevantAddressingMode(&StoreMI) &&
547 MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
548 if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
549 BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
550 }
551 }
552 }
553}
554
555unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
556 const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
557 *LoadInst->getParent()->getParent());
558 return TRI->getRegSizeInBits(*TRC) / 8;
559}
560
561void X86AvoidSFBPass::breakBlockedCopies(
563 const DisplacementSizeMap &BlockingStoresDispSizeMap) {
564 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
565 int64_t StDispImm = getDispOperand(StoreInst).getImm();
566 int64_t LMMOffset = 0;
567 int64_t SMMOffset = 0;
568
569 int64_t LdDisp1 = LdDispImm;
570 int64_t LdDisp2 = 0;
571 int64_t StDisp1 = StDispImm;
572 int64_t StDisp2 = 0;
573 unsigned Size1 = 0;
574 unsigned Size2 = 0;
575 int64_t LdStDelta = StDispImm - LdDispImm;
576
577 for (auto DispSizePair : BlockingStoresDispSizeMap) {
578 LdDisp2 = DispSizePair.first;
579 StDisp2 = DispSizePair.first + LdStDelta;
580 Size2 = DispSizePair.second;
581 // Avoid copying overlapping areas.
582 if (LdDisp2 < LdDisp1) {
583 int OverlapDelta = LdDisp1 - LdDisp2;
584 LdDisp2 += OverlapDelta;
585 StDisp2 += OverlapDelta;
586 Size2 -= OverlapDelta;
587 }
588 Size1 = LdDisp2 - LdDisp1;
589
590 // Build a copy for the point until the current blocking store's
591 // displacement.
592 buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
593 SMMOffset);
594 // Build a copy for the current blocking store.
595 buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
596 SMMOffset + Size1);
597 LdDisp1 = LdDisp2 + Size2;
598 StDisp1 = StDisp2 + Size2;
599 LMMOffset += Size1 + Size2;
600 SMMOffset += Size1 + Size2;
601 }
602 unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
603 buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
604 LMMOffset);
605}
606
609 const MachineOperand &LoadBase = getBaseOperand(LoadInst);
610 const MachineOperand &StoreBase = getBaseOperand(StoreInst);
611 if (LoadBase.isReg() != StoreBase.isReg())
612 return false;
613 if (LoadBase.isReg())
614 return LoadBase.getReg() == StoreBase.getReg();
615 return LoadBase.getIndex() == StoreBase.getIndex();
616}
617
618static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
619 int64_t StoreDispImm, unsigned StoreSize) {
620 return ((StoreDispImm >= LoadDispImm) &&
621 (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
622}
623
624// Keep track of all stores blocking a load
625static void
626updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
627 int64_t DispImm, unsigned Size) {
628 if (BlockingStoresDispSizeMap.count(DispImm)) {
629 // Choose the smallest blocking store starting at this displacement.
630 if (BlockingStoresDispSizeMap[DispImm] > Size)
631 BlockingStoresDispSizeMap[DispImm] = Size;
632
633 } else
634 BlockingStoresDispSizeMap[DispImm] = Size;
635}
636
637// Remove blocking stores contained in each other.
638static void
639removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
640 if (BlockingStoresDispSizeMap.size() <= 1)
641 return;
642
644 for (auto DispSizePair : BlockingStoresDispSizeMap) {
645 int64_t CurrDisp = DispSizePair.first;
646 unsigned CurrSize = DispSizePair.second;
647 while (DispSizeStack.size()) {
648 int64_t PrevDisp = DispSizeStack.back().first;
649 unsigned PrevSize = DispSizeStack.back().second;
650 if (CurrDisp + CurrSize > PrevDisp + PrevSize)
651 break;
652 DispSizeStack.pop_back();
653 }
654 DispSizeStack.push_back(DispSizePair);
655 }
656 BlockingStoresDispSizeMap.clear();
657 for (auto Disp : DispSizeStack)
658 BlockingStoresDispSizeMap.insert(Disp);
659}
660
661bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
662 bool Changed = false;
663
664 if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
665 !MF.getSubtarget<X86Subtarget>().is64Bit())
666 return false;
667
668 MRI = &MF.getRegInfo();
669 assert(MRI->isSSA() && "Expected MIR to be in SSA form");
670 TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
671 TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
672 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
673 LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
674 // Look for a load then a store to XMM/YMM which look like a memcpy
675 findPotentiallylBlockedCopies(MF);
676
677 for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
678 MachineInstr *LoadInst = LoadStoreInstPair.first;
679 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
680 DisplacementSizeMap BlockingStoresDispSizeMap;
681
682 SmallVector<MachineInstr *, 2> PotentialBlockers =
684 for (auto *PBInst : PotentialBlockers) {
685 if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
686 LoadInst->getOpcode()) ||
687 !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
688 continue;
689 int64_t PBstDispImm = getDispOperand(PBInst).getImm();
690 unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
691 // This check doesn't cover all cases, but it will suffice for now.
692 // TODO: take branch probability into consideration, if the blocking
693 // store is in an unreached block, breaking the memcopy could lose
694 // performance.
695 if (hasSameBaseOpValue(LoadInst, PBInst) &&
696 isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
697 PBstSize))
698 updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
699 PBstSize);
700 }
701
702 if (BlockingStoresDispSizeMap.empty())
703 continue;
704
705 // We found a store forward block, break the memcpy's load and store
706 // into smaller copies such that each smaller store that was causing
707 // a store block would now be copied separately.
708 MachineInstr *StoreInst = LoadStoreInstPair.second;
709 LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
712 LLVM_DEBUG(dbgs() << "Replaced with:\n");
713 removeRedundantBlockingStores(BlockingStoresDispSizeMap);
714 breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
716 ForRemoval.push_back(LoadInst);
717 ForRemoval.push_back(StoreInst);
718 }
719 for (auto *RemovedInst : ForRemoval) {
720 RemovedInst->eraseFromParent();
721 }
722 ForRemoval.clear();
723 BlockedLoadsStoresPairs.clear();
724 LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
725
726 return Changed;
727}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
COFF::MachineTypes Machine
Definition: COFFYAML.cpp:390
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Machine code sinking
static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode)
static bool isPotentialBlockedMemCpyLd(unsigned Opcode)
static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode)
static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode)
static const int MOV64SZ
static const int MOV8SZ
static bool isXMMLoadOpcode(unsigned Opcode)
static int getAddrOffset(const MachineInstr *MI)
static cl::opt< unsigned > X86AvoidSFBInspectionLimit("x86-sfb-inspection-limit", cl::desc("X86: Number of instructions backward to " "inspect for store forwarding blocks."), cl::init(20), cl::Hidden)
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize, int64_t StoreDispImm, unsigned StoreSize)
static bool isRelevantAddressingMode(MachineInstr *MI)
static cl::opt< bool > DisableX86AvoidStoreForwardBlocks("x86-disable-avoid-SFB", cl::Hidden, cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false))
static void removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap)
static const int MOV16SZ
static bool hasSameBaseOpValue(MachineInstr *LoadInst, MachineInstr *StoreInst)
static void updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap, int64_t DispImm, unsigned Size)
static MachineOperand & getBaseOperand(MachineInstr *MI)
static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode)
static SmallVector< MachineInstr *, 2 > findPotentialBlockers(MachineInstr *LoadInst)
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst)
static const int MOV32SZ
static MachineOperand & getDispOperand(MachineInstr *MI)
static bool isYMMLoadOpcode(unsigned Opcode)
static const int MOV128SZ
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
An instruction for reading from memory.
Definition: Instructions.h:176
TypeSize getValue() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
instr_iterator instr_begin()
void push_back(MachineInstr *MI)
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< pred_iterator > predecessors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:821
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:806
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Representation for a specific memory location.
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
size_t size() const
Definition: SmallVector.h:78
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Value * getOperand(unsigned i) const
Definition: User.h:228
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:5304
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
int getMemoryOperandNo(uint64_t TSFlags)
Definition: X86BaseInfo.h:1011
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
Definition: X86BaseInfo.h:968
@ AddrScaleAmt
Definition: X86BaseInfo.h:30
@ AddrSegmentReg
Definition: X86BaseInfo.h:34
@ AddrIndexReg
Definition: X86BaseInfo.h:31
@ AddrNumOperands
Definition: X86BaseInfo.h:36
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
IterT prev_nodbg(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It, then continue decrementing it while it points to a debug instruction.
FunctionPass * createX86AvoidStoreForwardingBlocks()
Return a pass that avoids creating store forward block issues in the hardware.