LLVM  3.7.0
X86VZeroUpper.cpp
Go to the documentation of this file.
1 //===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the pass which inserts x86 AVX vzeroupper instructions
11 // before calls to SSE encoded functions. This avoids transition latency
12 // penalty when transferring control between AVX encoded instructions and old
13 // SSE encoding mode.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "X86.h"
18 #include "X86InstrInfo.h"
19 #include "X86Subtarget.h"
20 #include "llvm/ADT/Statistic.h"
24 #include "llvm/CodeGen/Passes.h"
25 #include "llvm/Support/Debug.h"
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "x86-vzeroupper"
31 
32 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
33 
34 namespace {
35 
36  class VZeroUpperInserter : public MachineFunctionPass {
37  public:
38 
39  VZeroUpperInserter() : MachineFunctionPass(ID) {}
40  bool runOnMachineFunction(MachineFunction &MF) override;
41  const char *getPassName() const override {return "X86 vzeroupper inserter";}
42 
43  private:
44 
45  void processBasicBlock(MachineBasicBlock &MBB);
46  void insertVZeroUpper(MachineBasicBlock::iterator I,
47  MachineBasicBlock &MBB);
48  void addDirtySuccessor(MachineBasicBlock &MBB);
49 
50  typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
51  static const char* getBlockExitStateName(BlockExitState ST);
52 
53  // Core algorithm state:
54  // BlockState - Each block is either:
55  // - PASS_THROUGH: There are neither YMM dirtying instructions nor
56  // vzeroupper instructions in this block.
57  // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
58  // block that will ensure that YMM is clean on exit.
59  // - EXITS_DIRTY: An instruction in the block dirties YMM and no
60  // subsequent vzeroupper in the block clears it.
61  //
62  // AddedToDirtySuccessors - This flag is raised when a block is added to the
63  // DirtySuccessors list to ensure that it's not
64  // added multiple times.
65  //
66  // FirstUnguardedCall - Records the location of the first unguarded call in
67  // each basic block that may need to be guarded by a
68  // vzeroupper. We won't know whether it actually needs
69  // to be guarded until we discover a predecessor that
70  // is DIRTY_OUT.
71  struct BlockState {
72  BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
73  BlockExitState ExitState;
74  bool AddedToDirtySuccessors;
75  MachineBasicBlock::iterator FirstUnguardedCall;
76  };
77  typedef SmallVector<BlockState, 8> BlockStateMap;
78  typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
79 
80  BlockStateMap BlockStates;
81  DirtySuccessorsWorkList DirtySuccessors;
82  bool EverMadeChange;
83  const TargetInstrInfo *TII;
84 
85  static char ID;
86  };
87 
88  char VZeroUpperInserter::ID = 0;
89 }
90 
92  return new VZeroUpperInserter();
93 }
94 
95 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
96  switch (ST) {
97  case PASS_THROUGH: return "Pass-through";
98  case EXITS_DIRTY: return "Exits-dirty";
99  case EXITS_CLEAN: return "Exits-clean";
100  }
101  llvm_unreachable("Invalid block exit state.");
102 }
103 
104 static bool isYmmReg(unsigned Reg) {
105  return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
106 }
107 
110  E = MRI.livein_end(); I != E; ++I)
111  if (isYmmReg(I->first))
112  return true;
113 
114  return false;
115 }
116 
117 static bool clobbersAllYmmRegs(const MachineOperand &MO) {
118  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
119  if (!MO.clobbersPhysReg(reg))
120  return false;
121  }
122  return true;
123 }
124 
125 static bool hasYmmReg(MachineInstr *MI) {
126  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
127  const MachineOperand &MO = MI->getOperand(i);
128  if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
129  return true;
130  if (!MO.isReg())
131  continue;
132  if (MO.isDebug())
133  continue;
134  if (isYmmReg(MO.getReg()))
135  return true;
136  }
137  return false;
138 }
139 
140 /// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
141 /// instruction.
143  assert(MI->isCall() && "Can only be called on call instructions.");
144  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
145  const MachineOperand &MO = MI->getOperand(i);
146  if (!MO.isRegMask())
147  continue;
148  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
149  if (MO.clobbersPhysReg(reg))
150  return true;
151  }
152  }
153  return false;
154 }
155 
156 // Insert a vzeroupper instruction before I.
157 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
158  MachineBasicBlock &MBB) {
159  DebugLoc dl = I->getDebugLoc();
160  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
161  ++NumVZU;
162  EverMadeChange = true;
163 }
164 
165 // Add MBB to the DirtySuccessors list if it hasn't already been added.
166 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
167  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
168  DirtySuccessors.push_back(&MBB);
169  BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
170  }
171 }
172 
173 /// processBasicBlock - Loop over all of the instructions in the basic block,
174 /// inserting vzeroupper instructions before function calls.
175 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
176 
177  // Start by assuming that the block PASS_THROUGH, which implies no unguarded
178  // calls.
179  BlockExitState CurState = PASS_THROUGH;
180  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
181 
182  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
183  MachineInstr *MI = I;
184  bool isControlFlow = MI->isCall() || MI->isReturn();
185 
186  // Shortcut: don't need to check regular instructions in dirty state.
187  if (!isControlFlow && CurState == EXITS_DIRTY)
188  continue;
189 
190  if (hasYmmReg(MI)) {
191  // We found a ymm-using instruction; this could be an AVX instruction,
192  // or it could be control flow.
193  CurState = EXITS_DIRTY;
194  continue;
195  }
196 
197  // Check for control-flow out of the current function (which might
198  // indirectly execute SSE instructions).
199  if (!isControlFlow)
200  continue;
201 
202  // If the call won't clobber any YMM register, skip it as well. It usually
203  // happens on helper function calls (such as '_chkstk', '_ftol2') where
204  // standard calling convention is not used (RegMask is not used to mark
205  // register clobbered and register usage (def/imp-def/use) is well-defined
206  // and explicitly specified.
207  if (MI->isCall() && !callClobbersAnyYmmReg(MI))
208  continue;
209 
210  // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
211  // registers. This instruction has zero latency. In addition, the processor
212  // changes back to Clean state, after which execution of Intel SSE
213  // instructions or Intel AVX instructions has no transition penalty. Add
214  // the VZEROUPPER instruction before any function call/return that might
215  // execute SSE code.
216  // FIXME: In some cases, we may want to move the VZEROUPPER into a
217  // predecessor block.
218  if (CurState == EXITS_DIRTY) {
219  // After the inserted VZEROUPPER the state becomes clean again, but
220  // other YMM may appear before other subsequent calls or even before
221  // the end of the BB.
222  insertVZeroUpper(I, MBB);
223  CurState = EXITS_CLEAN;
224  } else if (CurState == PASS_THROUGH) {
225  // If this block is currently in pass-through state and we encounter a
226  // call then whether we need a vzeroupper or not depends on whether this
227  // block has successors that exit dirty. Record the location of the call,
228  // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
229  // It will be inserted later if necessary.
230  BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
231  CurState = EXITS_CLEAN;
232  }
233  }
234 
235  DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
236  << getBlockExitStateName(CurState) << '\n');
237 
238  if (CurState == EXITS_DIRTY)
240  SE = MBB.succ_end();
241  SI != SE; ++SI)
242  addDirtySuccessor(**SI);
243 
244  BlockStates[MBB.getNumber()].ExitState = CurState;
245 }
246 
247 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
248 /// vzeroupper instructions before function calls.
249 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
250  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
251  if (!ST.hasAVX() || ST.hasAVX512())
252  return false;
253  TII = ST.getInstrInfo();
254  MachineRegisterInfo &MRI = MF.getRegInfo();
255  EverMadeChange = false;
256 
257  bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
258 
259  // Fast check: if the function doesn't use any ymm registers, we don't need
260  // to insert any VZEROUPPER instructions. This is constant-time, so it is
261  // cheap in the common case of no ymm use.
262  bool YMMUsed = FnHasLiveInYmm;
263  if (!YMMUsed) {
264  const TargetRegisterClass *RC = &X86::VR256RegClass;
265  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
266  i++) {
267  if (!MRI.reg_nodbg_empty(*i)) {
268  YMMUsed = true;
269  break;
270  }
271  }
272  }
273  if (!YMMUsed) {
274  return false;
275  }
276 
277  assert(BlockStates.empty() && DirtySuccessors.empty() &&
278  "X86VZeroUpper state should be clear");
279  BlockStates.resize(MF.getNumBlockIDs());
280 
281  // Process all blocks. This will compute block exit states, record the first
282  // unguarded call in each block, and add successors of dirty blocks to the
283  // DirtySuccessors list.
284  for (MachineBasicBlock &MBB : MF)
285  processBasicBlock(MBB);
286 
287  // If any YMM regs are live in to this function, add the entry block to the
288  // DirtySuccessors list
289  if (FnHasLiveInYmm)
290  addDirtySuccessor(MF.front());
291 
292  // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add
293  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
294  // through PASS_THROUGH blocks.
295  while (!DirtySuccessors.empty()) {
296  MachineBasicBlock &MBB = *DirtySuccessors.back();
297  DirtySuccessors.pop_back();
298  BlockState &BBState = BlockStates[MBB.getNumber()];
299 
300  // MBB is a successor of a dirty block, so its first call needs to be
301  // guarded.
302  if (BBState.FirstUnguardedCall != MBB.end())
303  insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
304 
305  // If this successor was a pass-through block then it is now dirty, and its
306  // successors need to be added to the worklist (if they haven't been
307  // already).
308  if (BBState.ExitState == PASS_THROUGH) {
309  DEBUG(dbgs() << "MBB #" << MBB.getNumber()
310  << " was Pass-through, is now Dirty-out.\n");
312  SE = MBB.succ_end();
313  SI != SE; ++SI)
314  addDirtySuccessor(**SI);
315  }
316  }
317 
318  BlockStates.clear();
319  return EverMadeChange;
320 }
STATISTIC(NumFunctions,"Total number of functions")
livein_iterator livein_end() const
int getNumber() const
getNumber - MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a M...
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:262
A debug info location.
Definition: DebugLoc.h:34
FunctionPass * createX86IssueVZeroUpperPass()
createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions before each call to avoi...
unsigned getNumBlockIDs() const
getNumBlockIDs - Return the number of MBB ID's allocated.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:98
bool isReg() const
isReg - Tests if this is a MO_Register operand.
std::vector< MachineBasicBlock * >::iterator succ_iterator
iterator begin() const
begin/end - Return all of the registers in this class.
Reg
All possible values of the reg field in the ModR/M byte.
#define false
Definition: ConvertUTF.c:65
unsigned getNumOperands() const
Access to explicit operands of the instruction.
Definition: MachineInstr.h:271
static bool hasYmmReg(MachineInstr *MI)
TargetInstrInfo - Interface to description of machine instruction set.
bundle_iterator< MachineInstr, instr_iterator > iterator
static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI)
bool isReturn(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:399
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:273
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:294
MachineInstrBuilder BuildMI(MachineFunction &MF, DebugLoc DL, const MCInstrDesc &MCID)
BuildMI - Builder interface.
bool isRegMask() const
isRegMask - Tests if this is a MO_RegisterMask operand.
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:861
livein_iterator livein_begin() const
static bool isYmmReg(unsigned Reg)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:123
static bool clobbersPhysReg(const uint32_t *RegMask, unsigned PhysReg)
clobbersPhysReg - Returns true if this RegMask clobbers PhysReg.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:51
static bool callClobbersAnyYmmReg(MachineInstr *MI)
clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this instruction.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
#define I(x, y, z)
Definition: MD5.cpp:54
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:403
bool hasAVX512() const
Definition: X86Subtarget.h:331
std::vector< std::pair< unsigned, unsigned > >::const_iterator livein_iterator
unsigned getReg() const
getReg - Returns the register number.
static bool clobbersAllYmmRegs(const MachineOperand &MO)
bool isDebug() const
#define DEBUG(X)
Definition: Debug.h:92
bool reg_nodbg_empty(unsigned RegNo) const
reg_nodbg_empty - Return true if the only instructions using or defining Reg are Debug instructions...
bool hasAVX() const
Definition: X86Subtarget.h:329