#define DEBUG_TYPE "x86-avoid-SFB"
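// On Intel microarchitectures a store is normally forwarded straight to a
// subsequent load of the same bytes, but forwarding fails when a *wider*
// load reads bytes written by a narrower recent store (a "store forwarding
// block"): the load stalls until the store drains to the cache. This pass
// finds memcpy-like XMM/YMM load/store pairs whose source was partially
// written by such stores and rewrites them as a sequence of smaller copies.
//
// Illustrative pattern (hand-written, not from this file):
//   MOV32mi   [rsp+4], 42     ; 4-byte store
//   VMOVUPSrm xmm0, [rsp]     ; 16-byte load overlaps the store -> stall
//   VMOVUPSmr [rdi], xmm0
// The rewritten copy touches [rsp+4] with a matching 4-byte access, so each
// partial load either lines up with a store exactly or avoids it.

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;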
62 "x86-sfb-inspection-limit",
63 cl::desc(
"X86: Number of instructions backward to "
64 "inspect for store forwarding blocks."),
using DisplacementSizeMap = std::map<int64_t, unsigned>;
77 return "X86 Avoid Store Forwarding Blocks";
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);
  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                 int64_t LoadDisp, MachineInstr *StoreInst,
                 unsigned NStoreOpcode, int64_t StoreDisp,
                 unsigned Size, int64_t LMMOffset, int64_t SMMOffset);
char X86AvoidSFBPass::ID = 0;
FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}
static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}
static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}
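// A load/store pair only qualifies when both sides move the same width and
// element type; aligned and unaligned variants are interchangeable. For
// example (illustrative):
//   isPotentialBlockedMemCpyPair(X86::VMOVUPSYrm, X86::VMOVAPSYmr) -> true
//   isPotentialBlockedMemCpyPair(X86::VMOVUPSYrm, X86::VMOVUPSmr)  -> false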
static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}
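// Narrow scalar stores (1, 2, 4 or 8 bytes) can block any of the copies
// handled here; a 16-byte vector store is itself only narrower than a
// 32-byte load, so it counts as a blocker for YMM loads alone.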
static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}
static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}
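// Together the two YMM->XMM helpers let a blocked 32-byte copy be split into
// two 16-byte halves that keep the original element type, e.g.
// (illustrative) VMOVUPSYrm is halved to VMOVUPSrm loads and VMOVUPSYmr to
// VMOVUPSmr stores.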
static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment =
      MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}
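// In short, only plain [base + disp] (or [frame-index + disp]) addresses are
// handled: no index register, no scaling, no segment override, so two
// accesses can later be compared purely by their immediate displacements.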
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If the limit was not reached, spend the remaining budget on the
  // first-order predecessor blocks.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock *PMBB : MBB->predecessors()) {
      int PredCount = 0;
      for (MachineInstr &PBInst : reverse(*PMBB)) {
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
        PotentialBlockers.push_back(&PBInst);
      }
    }
  }
  return PotentialBlockers;
}
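// The backward scan is capped by -x86-sfb-inspection-limit (20 by default).
// A call ends the search because it may store anywhere, and meta
// (debug-only) instructions do not consume any of the budget.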
void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  // A narrower load (NLoadOpcode) and store (NStoreOpcode) of Size bytes are
  // built here with BuildMI at LoadDisp/StoreDisp, reusing the original base
  // operands and trimmed memory operands.
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  // If the load and store are consecutive, insert the new store at the
  // load's location to reduce register pressure.
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
}
void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    // Analogous rungs follow for the remaining widths, each ending in
    //   ... MOV64SZ, LMMOffset, SMMOffset);
    //   ... MOV32SZ, LMMOffset, SMMOffset);
    //   ... MOV16SZ, LMMOffset, SMMOffset);
    //   ... MOV8SZ, LMMOffset, SMMOffset);
  }
}
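// Rough worked example (sizes chosen for illustration): buildCopies with
// Size = 7 on an XMM pair emits a 4-byte (MOV32SZ), then a 2-byte (MOV16SZ),
// then a 1-byte (MOV8SZ) copy, advancing LdDisp, StDisp and both
// memory-operand offsets by each chunk as it goes.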
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store were consecutive, the partial copies
    // were emitted in consecutive order and the last partial load sits one
    // instruction further back.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}
  // Conservatively treat the two memory operands as aliasing unless AA can
  // prove otherwise over their overlapping extents.
  return !AA->isNoAlias(
      MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
      MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      MachineInstr &StoreMI = *MRI->use_instr_nodbg_begin(DefVR);
      // Record the pair when the single use is a matching store in the same
      // block with a relevant [base + disp] addressing mode.
      if (isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
          StoreMI.getParent() == MI.getParent() &&
          isRelevantAddressingMode(&MI) && isRelevantAddressingMode(&StoreMI) &&
          MI.hasOneMemOperand() && StoreMI.hasOneMemOperand())
        BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
    }
}
  return TRI->getRegSizeInBits(*TRC) / 8;
void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Copy the region up to the blocking store, then the region matching it.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2,
                LMMOffset + Size1, SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  // Copy the remaining tail of the original access.
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              LMMOffset);
}
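// Rough worked example (illustrative numbers): a 16-byte load at disp 0
// blocked by one 4-byte store at disp 8 yields
//   Size1 = 8 - 0 = 8          // bytes before the store
//   Size2 = 4                  // bytes covering the store exactly
//   Size3 = (0 + 16) - 12 = 4  // remaining tail
// so every partial load either matches the blocking store or avoids it.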
static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}
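// E.g. (illustrative) a 4-byte store at disp 8 blocks a 16-byte load at
// disp 0, since 8 >= 0 and 8 <= 0 + (16 - 4); a store at disp 14 does not,
// because its bytes would extend past the loaded region.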
static void updateBlockingStoresDispSizeMap(
    DisplacementSizeMap &BlockingStoresDispSizeMap, int64_t DispImm,
    unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Keep the smallest blocking store at a given displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;
  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}
static void removeRedundantBlockingStores(
    DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}
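// The std::map iterates in ascending displacement order, and an entry whose
// end does not extend past the previous one pops that entry. E.g.
// (illustrative) {0: 8, 4: 4} reduces to {4: 4}: the 4-byte store ending at
// offset 8 lies inside the 8-byte store's region, and splitting the copy
// around the smaller store is sufficient.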
bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;
  SmallVector<MachineInstr *, 2> ForRemoval;
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  findPotentiallylBlockedCopies(MF);
  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;
    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto *PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
      if (isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }
    if (BlockingStoresDispSizeMap.empty())
      continue;
    // The surviving blockers are then pruned and the copy is broken up
    // (removeRedundantBlockingStores, breakBlockedCopies, updateKillStatus),
    // and the original wide load and store are queued in ForRemoval.
  }
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  BlockedLoadsStoresPairs.clear();
  return Changed;
}
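// Per recorded pair, the flow above is: gather potential blockers within the
// inspection window, keep the ones that actually block the load, prune
// entries covered by others, break the copy around the survivors, fix kill
// flags, and erase the original wide load and store.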