LLVM  10.0.0svn
SIOptimizeExecMasking.cpp
Go to the documentation of this file.
1 //===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AMDGPU.h"
10 #include "AMDGPUSubtarget.h"
11 #include "SIInstrInfo.h"
13 #include "llvm/ADT/SmallSet.h"
17 #include "llvm/Support/Debug.h"
18 
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "si-optimize-exec-masking"
22 
23 namespace {
24 
25 class SIOptimizeExecMasking : public MachineFunctionPass {
26 public:
27  static char ID;
28 
29 public:
30  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
32  }
33 
34  bool runOnMachineFunction(MachineFunction &MF) override;
35 
36  StringRef getPassName() const override {
37  return "SI optimize exec mask operations";
38  }
39 
40  void getAnalysisUsage(AnalysisUsage &AU) const override {
41  AU.setPreservesCFG();
43  }
44 };
45 
46 } // End anonymous namespace.
47 
48 INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
49  "SI optimize exec mask operations", false, false)
51 INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
52  "SI optimize exec mask operations", false, false)
53 
54 char SIOptimizeExecMasking::ID = 0;
55 
56 char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
57 
58 /// If \p MI is a copy from exec, return the register copied to.
60  switch (MI.getOpcode()) {
61  case AMDGPU::COPY:
62  case AMDGPU::S_MOV_B64:
63  case AMDGPU::S_MOV_B64_term:
64  case AMDGPU::S_MOV_B32:
65  case AMDGPU::S_MOV_B32_term: {
66  const MachineOperand &Src = MI.getOperand(1);
67  if (Src.isReg() &&
68  Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
69  return MI.getOperand(0).getReg();
70  }
71  }
72 
73  return AMDGPU::NoRegister;
74 }
75 
76 /// If \p MI is a copy to exec, return the register copied from.
77 static unsigned isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
78  switch (MI.getOpcode()) {
79  case AMDGPU::COPY:
80  case AMDGPU::S_MOV_B64:
81  case AMDGPU::S_MOV_B32: {
82  const MachineOperand &Dst = MI.getOperand(0);
83  if (Dst.isReg() &&
84  Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
85  MI.getOperand(1).isReg())
86  return MI.getOperand(1).getReg();
87  break;
88  }
89  case AMDGPU::S_MOV_B64_term:
90  case AMDGPU::S_MOV_B32_term:
91  llvm_unreachable("should have been replaced");
92  }
93 
94  return AMDGPU::NoRegister;
95 }
96 
97 /// If \p MI is a logical operation on an exec value,
98 /// return the register copied to.
99 static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
100  switch (MI.getOpcode()) {
101  case AMDGPU::S_AND_B64:
102  case AMDGPU::S_OR_B64:
103  case AMDGPU::S_XOR_B64:
104  case AMDGPU::S_ANDN2_B64:
105  case AMDGPU::S_ORN2_B64:
106  case AMDGPU::S_NAND_B64:
107  case AMDGPU::S_NOR_B64:
108  case AMDGPU::S_XNOR_B64: {
109  const MachineOperand &Src1 = MI.getOperand(1);
110  if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
111  return MI.getOperand(0).getReg();
112  const MachineOperand &Src2 = MI.getOperand(2);
113  if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
114  return MI.getOperand(0).getReg();
115  break;
116  }
117  case AMDGPU::S_AND_B32:
118  case AMDGPU::S_OR_B32:
119  case AMDGPU::S_XOR_B32:
120  case AMDGPU::S_ANDN2_B32:
121  case AMDGPU::S_ORN2_B32:
122  case AMDGPU::S_NAND_B32:
123  case AMDGPU::S_NOR_B32:
124  case AMDGPU::S_XNOR_B32: {
125  const MachineOperand &Src1 = MI.getOperand(1);
126  if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
127  return MI.getOperand(0).getReg();
128  const MachineOperand &Src2 = MI.getOperand(2);
129  if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
130  return MI.getOperand(0).getReg();
131  break;
132  }
133  }
134 
135  return AMDGPU::NoRegister;
136 }
137 
138 static unsigned getSaveExecOp(unsigned Opc) {
139  switch (Opc) {
140  case AMDGPU::S_AND_B64:
141  return AMDGPU::S_AND_SAVEEXEC_B64;
142  case AMDGPU::S_OR_B64:
143  return AMDGPU::S_OR_SAVEEXEC_B64;
144  case AMDGPU::S_XOR_B64:
145  return AMDGPU::S_XOR_SAVEEXEC_B64;
146  case AMDGPU::S_ANDN2_B64:
147  return AMDGPU::S_ANDN2_SAVEEXEC_B64;
148  case AMDGPU::S_ORN2_B64:
149  return AMDGPU::S_ORN2_SAVEEXEC_B64;
150  case AMDGPU::S_NAND_B64:
151  return AMDGPU::S_NAND_SAVEEXEC_B64;
152  case AMDGPU::S_NOR_B64:
153  return AMDGPU::S_NOR_SAVEEXEC_B64;
154  case AMDGPU::S_XNOR_B64:
155  return AMDGPU::S_XNOR_SAVEEXEC_B64;
156  case AMDGPU::S_AND_B32:
157  return AMDGPU::S_AND_SAVEEXEC_B32;
158  case AMDGPU::S_OR_B32:
159  return AMDGPU::S_OR_SAVEEXEC_B32;
160  case AMDGPU::S_XOR_B32:
161  return AMDGPU::S_XOR_SAVEEXEC_B32;
162  case AMDGPU::S_ANDN2_B32:
163  return AMDGPU::S_ANDN2_SAVEEXEC_B32;
164  case AMDGPU::S_ORN2_B32:
165  return AMDGPU::S_ORN2_SAVEEXEC_B32;
166  case AMDGPU::S_NAND_B32:
167  return AMDGPU::S_NAND_SAVEEXEC_B32;
168  case AMDGPU::S_NOR_B32:
169  return AMDGPU::S_NOR_SAVEEXEC_B32;
170  case AMDGPU::S_XNOR_B32:
171  return AMDGPU::S_XNOR_SAVEEXEC_B32;
172  default:
173  return AMDGPU::INSTRUCTION_LIST_END;
174  }
175 }
176 
177 // These are only terminators to get correct spill code placement during
178 // register allocation, so turn them back into normal instructions. Only one of
179 // these is expected per block.
181  switch (MI.getOpcode()) {
182  case AMDGPU::S_MOV_B64_term:
183  case AMDGPU::S_MOV_B32_term: {
184  MI.setDesc(TII.get(AMDGPU::COPY));
185  return true;
186  }
187  case AMDGPU::S_XOR_B64_term: {
188  // This is only a terminator to get the correct spill code placement during
189  // register allocation.
190  MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
191  return true;
192  }
193  case AMDGPU::S_XOR_B32_term: {
194  // This is only a terminator to get the correct spill code placement during
195  // register allocation.
196  MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
197  return true;
198  }
199  case AMDGPU::S_OR_B32_term: {
200  // This is only a terminator to get the correct spill code placement during
201  // register allocation.
202  MI.setDesc(TII.get(AMDGPU::S_OR_B32));
203  return true;
204  }
205  case AMDGPU::S_ANDN2_B64_term: {
206  // This is only a terminator to get the correct spill code placement during
207  // register allocation.
208  MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
209  return true;
210  }
211  case AMDGPU::S_ANDN2_B32_term: {
212  // This is only a terminator to get the correct spill code placement during
213  // register allocation.
214  MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
215  return true;
216  }
217  default:
218  return false;
219  }
220 }
221 
223  const SIInstrInfo &TII,
224  MachineBasicBlock &MBB) {
226  for (; I != E; ++I) {
227  if (!I->isTerminator())
228  return I;
229 
230  if (removeTerminatorBit(TII, *I))
231  return I;
232  }
233 
234  return E;
235 }
236 
238  const SIInstrInfo &TII,
239  const GCNSubtarget &ST,
240  MachineBasicBlock &MBB,
242  unsigned CopyToExec) {
243  const unsigned InstLimit = 25;
244 
245  auto E = MBB.rend();
246  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
247  unsigned CopyFromExec = isCopyFromExec(*I, ST);
248  if (CopyFromExec != AMDGPU::NoRegister)
249  return I;
250  }
251 
252  return E;
253 }
254 
255 // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
256 // report the register as unavailable because a super-register with a lane mask
257 // is unavailable.
258 static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
259  for (MachineBasicBlock *Succ : MBB.successors()) {
260  if (Succ->isLiveIn(Reg))
261  return true;
262  }
263 
264  return false;
265 }
266 
267 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
268  if (skipFunction(MF.getFunction()))
269  return false;
270 
271  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
272  const SIRegisterInfo *TRI = ST.getRegisterInfo();
273  const SIInstrInfo *TII = ST.getInstrInfo();
274  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
275 
276  // Optimize sequences emitted for control flow lowering. They are originally
277  // emitted as the separate operations because spill code may need to be
278  // inserted for the saved copy of exec.
279  //
280  // x = copy exec
281  // z = s_<op>_b64 x, y
282  // exec = copy z
283  // =>
284  // x = s_<op>_saveexec_b64 y
285  //
286 
287  for (MachineBasicBlock &MBB : MF) {
290  if (I == E)
291  continue;
292 
293  unsigned CopyToExec = isCopyToExec(*I, ST);
294  if (CopyToExec == AMDGPU::NoRegister)
295  continue;
296 
297  // Scan backwards to find the def.
298  auto CopyToExecInst = &*I;
299  auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
300  if (CopyFromExecInst == E) {
301  auto PrepareExecInst = std::next(I);
302  if (PrepareExecInst == E)
303  continue;
304  // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
305  if (CopyToExecInst->getOperand(1).isKill() &&
306  isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
307  LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
308 
309  PrepareExecInst->getOperand(0).setReg(Exec);
310 
311  LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
312 
313  CopyToExecInst->eraseFromParent();
314  }
315 
316  continue;
317  }
318 
319  if (isLiveOut(MBB, CopyToExec)) {
320  // The copied register is live out and has a second use in another block.
321  LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
322  continue;
323  }
324 
325  Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
326  MachineInstr *SaveExecInst = nullptr;
327  SmallVector<MachineInstr *, 4> OtherUseInsts;
328 
330  = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
331  J != JE; ++J) {
332  if (SaveExecInst && J->readsRegister(Exec, TRI)) {
333  LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
334  // Make sure this is inserted after any VALU ops that may have been
335  // scheduled in between.
336  SaveExecInst = nullptr;
337  break;
338  }
339 
340  bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);
341 
342  if (J->modifiesRegister(CopyToExec, TRI)) {
343  if (SaveExecInst) {
344  LLVM_DEBUG(dbgs() << "Multiple instructions modify "
345  << printReg(CopyToExec, TRI) << '\n');
346  SaveExecInst = nullptr;
347  break;
348  }
349 
350  unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
351  if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
352  break;
353 
354  if (ReadsCopyFromExec) {
355  SaveExecInst = &*J;
356  LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
357  continue;
358  } else {
359  LLVM_DEBUG(dbgs()
360  << "Instruction does not read exec copy: " << *J << '\n');
361  break;
362  }
363  } else if (ReadsCopyFromExec && !SaveExecInst) {
364  // Make sure no other instruction is trying to use this copy, before it
365  // will be rewritten by the saveexec, i.e. hasOneUse. There may have
366  // been another use, such as an inserted spill. For example:
367  //
368  // %sgpr0_sgpr1 = COPY %exec
369  // spill %sgpr0_sgpr1
370  // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
371  //
372  LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
373  << '\n');
374  break;
375  }
376 
377  if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
378  assert(SaveExecInst != &*J);
379  OtherUseInsts.push_back(&*J);
380  }
381  }
382 
383  if (!SaveExecInst)
384  continue;
385 
386  LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
387 
388  MachineOperand &Src0 = SaveExecInst->getOperand(1);
389  MachineOperand &Src1 = SaveExecInst->getOperand(2);
390 
391  MachineOperand *OtherOp = nullptr;
392 
393  if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
394  OtherOp = &Src1;
395  } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
396  if (!SaveExecInst->isCommutable())
397  break;
398 
399  OtherOp = &Src0;
400  } else
401  llvm_unreachable("unexpected");
402 
403  CopyFromExecInst->eraseFromParent();
404 
405  auto InsPt = SaveExecInst->getIterator();
406  const DebugLoc &DL = SaveExecInst->getDebugLoc();
407 
408  BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
409  CopyFromExec)
410  .addReg(OtherOp->getReg());
411  SaveExecInst->eraseFromParent();
412 
413  CopyToExecInst->eraseFromParent();
414 
415  for (MachineInstr *OtherInst : OtherUseInsts) {
416  OtherInst->substituteRegister(CopyToExec, Exec,
417  AMDGPU::NoSubRegister, *TRI);
418  }
419  }
420 
421  return true;
422 
423 }
static unsigned isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST)
If MI is a copy from exec, return the register copied to.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
AMDGPU specific subclass of TargetSubtarget.
This class represents lattice values for constants.
Definition: AllocatorList.h:23
void push_back(const T &Elt)
Definition: SmallVector.h:211
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:384
unsigned Reg
const SIInstrInfo * getInstrInfo() const override
unsigned const TargetRegisterInfo * TRI
A debug info location.
Definition: DebugLoc.h:33
static unsigned isLogicalOpOnExec(const MachineInstr &MI)
If MI is a logical operation on an exec value, return the register copied to.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI)
static MachineBasicBlock::reverse_iterator findExecCopy(const SIInstrInfo &TII, const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, unsigned CopyToExec)
static MachineBasicBlock::reverse_iterator fixTerminators(const SIInstrInfo &TII, MachineBasicBlock &MBB)
iterator_range< succ_iterator > successors()
static unsigned isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST)
If MI is a copy to exec, return the register copied from.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
MachineBasicBlock iterator that automatically skips over MIs that are inside bundles (i...
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:410
void initializeSIOptimizeExecMaskingPass(PassRegistry &)
SI optimize exec mask operations
#define DEBUG_TYPE
reverse_iterator rend()
reverse_iterator rbegin()
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Represent the analysis usage information of a pass.
self_iterator getIterator()
Definition: ilist_node.h:81
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
char & SIOptimizeExecMaskingID
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr reads the specified register.
void setDesc(const MCInstrDesc &tid)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one...
MachineOperand class - Representation of each machine instruction operand.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:301
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:63
Interface definition for SIInstrInfo.
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE, "SI optimize exec mask operations", false, false) INITIALIZE_PASS_END(SIOptimizeExecMasking
bool isReg() const
isReg - Tests if this is a MO_Register operand.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
aarch64 promote const
static unsigned getSaveExecOp(unsigned Opc)
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg)
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
Register getReg() const
getReg - Returns the register number.
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:415
bool isCommutable(QueryType Type=IgnoreBundle) const
Return true if this may be a 2- or 3-address instruction (of the form "X = op Y, Z, ..."), which produces the same result if Y and Z are exchanged.
Definition: MachineInstr.h:882
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
const SIRegisterInfo * getRegisterInfo() const override