LLVM  4.0.0
X86VZeroUpper.cpp
Go to the documentation of this file.
1 //===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the pass which inserts x86 AVX vzeroupper instructions
11 // before calls to SSE encoded functions. This avoids transition latency
12 // penalty when transferring control between AVX encoded instructions and old
13 // SSE encoding mode.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "X86.h"
18 #include "X86InstrInfo.h"
19 #include "X86Subtarget.h"
20 #include "llvm/ADT/Statistic.h"
24 #include "llvm/CodeGen/Passes.h"
25 #include "llvm/Support/Debug.h"
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "x86-vzeroupper"
31 
32 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
33 
34 namespace {
35 
36  class VZeroUpperInserter : public MachineFunctionPass {
37  public:
38 
39  VZeroUpperInserter() : MachineFunctionPass(ID) {}
40  bool runOnMachineFunction(MachineFunction &MF) override;
41  MachineFunctionProperties getRequiredProperties() const override {
44  }
45  StringRef getPassName() const override { return "X86 vzeroupper inserter"; }
46 
47  private:
48 
49  void processBasicBlock(MachineBasicBlock &MBB);
50  void insertVZeroUpper(MachineBasicBlock::iterator I,
52  void addDirtySuccessor(MachineBasicBlock &MBB);
53 
54  typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
55  static const char* getBlockExitStateName(BlockExitState ST);
56 
57  // Core algorithm state:
58  // BlockState - Each block is either:
59  // - PASS_THROUGH: There are neither YMM dirtying instructions nor
60  // vzeroupper instructions in this block.
61  // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
62  // block that will ensure that YMM is clean on exit.
63  // - EXITS_DIRTY: An instruction in the block dirties YMM and no
64  // subsequent vzeroupper in the block clears it.
65  //
66  // AddedToDirtySuccessors - This flag is raised when a block is added to the
67  // DirtySuccessors list to ensure that it's not
68  // added multiple times.
69  //
70  // FirstUnguardedCall - Records the location of the first unguarded call in
71  // each basic block that may need to be guarded by a
72  // vzeroupper. We won't know whether it actually needs
73  // to be guarded until we discover a predecessor that
74 // is EXITS_DIRTY.
75  struct BlockState {
76  BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
77  BlockExitState ExitState;
78  bool AddedToDirtySuccessors;
79  MachineBasicBlock::iterator FirstUnguardedCall;
80  };
81  typedef SmallVector<BlockState, 8> BlockStateMap;
82  typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
83 
84  BlockStateMap BlockStates;
85  DirtySuccessorsWorkList DirtySuccessors;
86  bool EverMadeChange;
87  bool IsX86INTR;
88  const TargetInstrInfo *TII;
89 
90  static char ID;
91  };
92 
93  char VZeroUpperInserter::ID = 0;
94 }
95 
97  return new VZeroUpperInserter();
98 }
99 
100 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
101  switch (ST) {
102  case PASS_THROUGH: return "Pass-through";
103  case EXITS_DIRTY: return "Exits-dirty";
104  case EXITS_CLEAN: return "Exits-clean";
105  }
106  llvm_unreachable("Invalid block exit state.");
107 }
108 
109 static bool isYmmReg(unsigned Reg) {
110  return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
111 }
112 
115  E = MRI.livein_end(); I != E; ++I)
116  if (isYmmReg(I->first))
117  return true;
118 
119  return false;
120 }
121 
122 static bool clobbersAllYmmRegs(const MachineOperand &MO) {
123  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
124  if (!MO.clobbersPhysReg(reg))
125  return false;
126  }
127  return true;
128 }
129 
130 static bool hasYmmReg(MachineInstr &MI) {
131  for (const MachineOperand &MO : MI.operands()) {
132  if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
133  return true;
134  if (!MO.isReg())
135  continue;
136  if (MO.isDebug())
137  continue;
138  if (isYmmReg(MO.getReg()))
139  return true;
140  }
141  return false;
142 }
143 
144 /// Check if any YMM register will be clobbered by this instruction.
146  assert(MI.isCall() && "Can only be called on call instructions.");
147  for (const MachineOperand &MO : MI.operands()) {
148  if (!MO.isRegMask())
149  continue;
150  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
151  if (MO.clobbersPhysReg(reg))
152  return true;
153  }
154  }
155  return false;
156 }
157 
158 /// Insert a vzeroupper instruction before I.
159 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
161  DebugLoc dl = I->getDebugLoc();
162  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
163  ++NumVZU;
164  EverMadeChange = true;
165 }
166 
167 /// Add MBB to the DirtySuccessors list if it hasn't already been added.
168 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
169  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
170  DirtySuccessors.push_back(&MBB);
171  BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
172  }
173 }
174 
175 /// Loop over all of the instructions in the basic block, inserting vzeroupper
176 /// instructions before function calls.
177 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
178 
179  // Start by assuming that the block is PASS_THROUGH which implies no unguarded
180  // calls.
181  BlockExitState CurState = PASS_THROUGH;
182  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
183 
184  for (MachineInstr &MI : MBB) {
185  // No need for vzeroupper before iret in interrupt handler function,
186  // epilogue will restore YMM registers if needed.
187  bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
188  bool IsControlFlow = MI.isCall() || MI.isReturn();
189 
190  // An existing VZERO* instruction resets the state.
191  if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
192  CurState = EXITS_CLEAN;
193  continue;
194  }
195 
196  // Shortcut: don't need to check regular instructions in dirty state.
197  if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
198  continue;
199 
200  if (hasYmmReg(MI)) {
201  // We found a ymm-using instruction; this could be an AVX instruction,
202  // or it could be control flow.
203  CurState = EXITS_DIRTY;
204  continue;
205  }
206 
207  // Check for control-flow out of the current function (which might
208  // indirectly execute SSE instructions).
209  if (!IsControlFlow || IsReturnFromX86INTR)
210  continue;
211 
212  // If the call won't clobber any YMM register, skip it as well. It usually
213  // happens on helper function calls (such as '_chkstk', '_ftol2') where
214  // standard calling convention is not used (RegMask is not used to mark
215  // register clobbered and register usage (def/imp-def/use) is well-defined
216  // and explicitly specified.
217  if (MI.isCall() && !callClobbersAnyYmmReg(MI))
218  continue;
219 
220  // The VZEROUPPER instruction resets the upper 128 bits of all AVX
221  // registers. In addition, the processor changes back to Clean state, after
222  // which execution of SSE instructions or AVX instructions has no transition
223  // penalty. Add the VZEROUPPER instruction before any function call/return
224  // that might execute SSE code.
225  // FIXME: In some cases, we may want to move the VZEROUPPER into a
226  // predecessor block.
227  if (CurState == EXITS_DIRTY) {
228  // After the inserted VZEROUPPER the state becomes clean again, but
229  // other YMM may appear before other subsequent calls or even before
230  // the end of the BB.
231  insertVZeroUpper(MI, MBB);
232  CurState = EXITS_CLEAN;
233  } else if (CurState == PASS_THROUGH) {
234  // If this block is currently in pass-through state and we encounter a
235  // call then whether we need a vzeroupper or not depends on whether this
236  // block has successors that exit dirty. Record the location of the call,
237  // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
238  // It will be inserted later if necessary.
239  BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
240  CurState = EXITS_CLEAN;
241  }
242  }
243 
244  DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
245  << getBlockExitStateName(CurState) << '\n');
246 
247  if (CurState == EXITS_DIRTY)
248  for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
249  SE = MBB.succ_end();
250  SI != SE; ++SI)
251  addDirtySuccessor(**SI);
252 
253  BlockStates[MBB.getNumber()].ExitState = CurState;
254 }
255 
256 /// Loop over all of the basic blocks, inserting vzeroupper instructions before
257 /// function calls.
258 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
259  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
260  if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
261  return false;
262  TII = ST.getInstrInfo();
264  EverMadeChange = false;
265  IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
266 
267  bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
268 
269  // Fast check: if the function doesn't use any ymm registers, we don't need
270  // to insert any VZEROUPPER instructions. This is constant-time, so it is
271  // cheap in the common case of no ymm use.
272  bool YMMUsed = FnHasLiveInYmm;
273  if (!YMMUsed) {
274  const TargetRegisterClass *RC = &X86::VR256RegClass;
275  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
276  i++) {
277  if (!MRI.reg_nodbg_empty(*i)) {
278  YMMUsed = true;
279  break;
280  }
281  }
282  }
283  if (!YMMUsed) {
284  return false;
285  }
286 
287  assert(BlockStates.empty() && DirtySuccessors.empty() &&
288  "X86VZeroUpper state should be clear");
289  BlockStates.resize(MF.getNumBlockIDs());
290 
291  // Process all blocks. This will compute block exit states, record the first
292  // unguarded call in each block, and add successors of dirty blocks to the
293  // DirtySuccessors list.
294  for (MachineBasicBlock &MBB : MF)
295  processBasicBlock(MBB);
296 
297  // If any YMM regs are live-in to this function, add the entry block to the
298  // DirtySuccessors list
299  if (FnHasLiveInYmm)
300  addDirtySuccessor(MF.front());
301 
302  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
303  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
304  // through PASS_THROUGH blocks.
305  while (!DirtySuccessors.empty()) {
306  MachineBasicBlock &MBB = *DirtySuccessors.back();
307  DirtySuccessors.pop_back();
308  BlockState &BBState = BlockStates[MBB.getNumber()];
309 
310  // MBB is a successor of a dirty block, so its first call needs to be
311  // guarded.
312  if (BBState.FirstUnguardedCall != MBB.end())
313  insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
314 
315  // If this successor was a pass-through block, then it is now dirty. Its
316  // successors need to be added to the worklist (if they haven't been
317  // already).
318  if (BBState.ExitState == PASS_THROUGH) {
319  DEBUG(dbgs() << "MBB #" << MBB.getNumber()
320  << " was Pass-through, is now Dirty-out.\n");
321  for (MachineBasicBlock *Succ : MBB.successors())
322  addDirtySuccessor(*Succ);
323  }
324  }
325 
326  BlockStates.clear();
327  return EverMadeChange;
328 }
STATISTIC(NumFunctions,"Total number of functions")
size_t i
livein_iterator livein_end() const
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:344
bool hasFastPartialYMMWrite() const
Definition: X86Subtarget.h:465
A debug info location.
Definition: DebugLoc.h:34
const Function * getFunction() const
getFunction - Return the LLVM function that this machine code represents
FunctionPass * createX86IssueVZeroUpperPass()
This pass inserts AVX vzeroupper instructions before each call to avoid transition penalty between fu...
iterator_range< mop_iterator > operands()
Definition: MachineInstr.h:301
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:165
iterator_range< succ_iterator > successors()
unsigned getNumBlockIDs() const
getNumBlockIDs - Return the number of MBB ID's allocated.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
X86_INTR - x86 hardware interrupt context.
Definition: CallingConv.h:169
std::vector< MachineBasicBlock * >::iterator succ_iterator
iterator begin() const
begin/end - Return all of the registers in this class.
Reg
All possible values of the reg field in the ModR/M byte.
MachineBasicBlock * MBB
Function Alias Analysis false
static bool hasYmmReg(MachineInstr &MI)
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
TargetInstrInfo - Interface to description of machine instruction set.
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI)
unsigned const MachineRegisterInfo * MRI
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:843
livein_iterator livein_begin() const
static bool isYmmReg(unsigned Reg)
static bool callClobbersAnyYmmReg(MachineInstr &MI)
Check if any YMM register will be clobbered by this instruction.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
static bool clobbersPhysReg(const uint32_t *RegMask, unsigned PhysReg)
clobbersPhysReg - Returns true if this RegMask clobbers PhysReg.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
MachineFunctionProperties & set(Property P)
Representation of each machine instruction.
Definition: MachineInstr.h:52
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
#define I(x, y, z)
Definition: MD5.cpp:54
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:424
bool hasAVX512() const
Definition: X86Subtarget.h:418
std::vector< std::pair< unsigned, unsigned > >::const_iterator livein_iterator
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool clobbersAllYmmRegs(const MachineOperand &MO)
#define DEBUG(X)
Definition: Debug.h:100
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:47
bool reg_nodbg_empty(unsigned RegNo) const
reg_nodbg_empty - Return true if the only instructions using or defining Reg are Debug instructions...
bool hasAVX() const
Definition: X86Subtarget.h:416
Properties which a MachineFunction may have at a given point in time.