#define DEBUG_TYPE "x86-avoid-SFB"
static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);
using DisplacementSizeMap = std::map<int64_t, unsigned>;
namespace {

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  AliasAnalysis *AA = nullptr;
  // Pairs of a load and the store that consumes its value, which together
  // look like a memcpy and may be hit by blocking stores.
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;

  void findPotentiallylBlockedCopies(MachineFunction &MF);
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);
  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                 int64_t LoadDisp, MachineInstr *StoreInst,
                 unsigned NStoreOpcode, int64_t StoreDisp, unsigned Size,
                 int64_t LMMOffset, int64_t SMMOffset);
  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
  unsigned getRegSizeInBytes(MachineInstr *LoadInst);
};

} // end anonymous namespace
char X86AvoidSFBPass::ID = 0;

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}
static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}
static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}
static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  // Narrow scalar stores can block forwarding into any wide load.
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  // For a 32-byte YMM load, a 16-byte XMM store is also narrower than the
  // load and can therefore block it.
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}
// Copy-chunk sizes in bytes.
static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load type");
  }
}
static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store type");
  }
}
static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Desc = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Desc);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  return MI->getOperand(getAddrOffset(MI) + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  return MI->getOperand(getAddrOffset(MI) + X86::AddrDisp);
}
// The pass only handles a simple [base + disp] addressing mode: no index
// register, unit scale, and no segment override. The base may be a register
// or a frame index.
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment =
      MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  // Walk backward from the load within its own block.
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    // A call may write memory arbitrarily; stop the search here.
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If the limit was not exhausted, also inspect first-order predecessor
  // blocks, since an in-flight store may have been issued there.
  if (BlockCount < InspectionLimit) {
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock *PMBB : LoadInst->getParent()->predecessors()) {
      int PredCount = 0;
      for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
        PotentialBlockers.push_back(&PBInst);
      }
    }
  }
  return PotentialBlockers;
}
void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();

  // Build the narrow load into a fresh virtual register, with a memory
  // operand rebased at LMMOffset and narrowed to Size bytes.
  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  // The original wide load still follows, so its base must stay live.
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);

  // If the load and store are consecutive, insert the narrow store at the
  // load's position to reduce register pressure.
  MachineBasicBlock::iterator StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;

  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StoreInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
}
void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  // Greedily emit the widest copy that still fits: a 16-byte XMM copy for
  // YMM sources, then 8-, 4-, 2- and 1-byte scalar moves for the rest.
  while (Size > 0) {
    unsigned MovSize;
    if (Size >= MOV128SZ && isYMMLoadOpcode(LoadInst->getOpcode())) {
      MovSize = MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
    } else if (Size >= MOV64SZ) {
      MovSize = MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr,
                StDisp, MOV64SZ, LMMOffset, SMMOffset);
    } else if (Size >= MOV32SZ) {
      MovSize = MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr,
                StDisp, MOV32SZ, LMMOffset, SMMOffset);
    } else if (Size >= MOV16SZ) {
      MovSize = MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr,
                StDisp, MOV16SZ, LMMOffset, SMMOffset);
    } else {
      MovSize = MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
    }
    Size -= MovSize;
    LdDisp += MovSize;
    StDisp += MovSize;
    LMMOffset += MovSize;
    SMMOffset += MovSize;
  }
}
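// Illustrative walk-through with made-up sizes: for a 14-byte remainder on
// an XMM copy, the loop above emits an 8-byte, then a 4-byte, then a 2-byte
// copy (MOV64 + MOV32 + MOV16), advancing the load/store displacements and
// memory-operand offsets by the chunk size after each step.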
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store were consecutive, the narrow copies
    // were emitted back to back, so the last narrow load sits one
    // instruction further up (before the last narrow store).
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}
bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue() || !Op1.getSize().hasValue() ||
      !Op2.getSize().hasValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;

  return !AA->isNoAlias(
      MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
      MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
}
void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (MachineInstr &StoreMI : MRI->use_nodbg_instructions(DefVR)) {
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(),
                                         StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI) && MI.hasOneMemOperand() &&
            StoreMI.hasOneMemOperand()) {
          // Only pairs whose memory operands provably do not overlap can
          // be broken up safely.
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}
unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                                     *LoadInst->getMF());
  return TRI->getRegSizeInBits(*TRC) / 8;
}
void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Copy the bytes up to the current blocking store, then the bytes the
    // blocking store itself wrote, as a separate same-size copy.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2,
                LMMOffset + Size1, SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  // Copy whatever remains after the last blocking store.
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              SMMOffset);
}
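// Illustrative walk-through with made-up displacements: for a 32-byte YMM
// copy from [Ld+0] to [St+64] with one blocking 4-byte store at Ld+8, the
// loop emits a copy of bytes [0,8) (the prefix), a 4-byte copy of [8,12)
// matching the blocking store, and then Size3 = 20 trailing bytes [12,32),
// which buildCopies further splits into a 16-byte and a 4-byte copy.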
static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}
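// For example, a 4-byte store at displacement 8 blocks a 32-byte load at
// displacement 0, since 8 >= 0 and 8 <= 0 + (32 - 4); a 4-byte store at
// displacement 30 does not, because it extends past the loaded range.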
static void updateBlockingStoresDispSizeMap(
    DisplacementSizeMap &BlockingStoresDispSizeMap, int64_t DispImm,
    unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Keep the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;
  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}
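// For example, if an 8-byte and a 4-byte blocking store both start at
// displacement 8, the map ends up with {8 -> 4}: the smaller size wins.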
static void removeRedundantBlockingStores(
    DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    // Pop every previous store that the current one ends inside of; a
    // nested store makes the enclosing one redundant.
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}
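// For example, with blocking stores {0 -> 16, 4 -> 4}, the 4-byte store at
// displacement 4 ends inside the 16-byte store at 0 (4 + 4 <= 0 + 16), so
// the enclosing entry is popped and only {4 -> 4} remains.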
bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  // Look for a load then a store to XMM/YMM which look like a memcpy.
  findPotentiallylBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;
    // Collect the narrower stores that write into the loaded range and
    // would therefore block store-to-load forwarding.
    for (auto *PBInst : findPotentialBlockers(LoadInst)) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // A store forwarding block was found: break the memcpy's load and store
    // into smaller copies so that each blocking store is copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
    Changed = true;
  }
  for (auto *RemovedInst : ForRemoval)
    RemovedInst->eraseFromParent();
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  return Changed;
}
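// Sketch of the end-to-end effect on hypothetical code (registers and
// displacements are illustrative only): for
//   movq    %rax, 8(%rsp)        # narrow store into the copied range
//   vmovups (%rsp), %ymm0        # 32-byte load: forwarding is blocked
//   vmovups %ymm0, 64(%rsp)      # 32-byte store
// the pass rewrites the load/store pair into narrower copies ([0,8),
// [8,16), [16,32)) so the bytes written by the narrow store are re-read by
// a same-size 8-byte load, which store-to-load forwarding handles directly.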