LLVM  15.0.0git
SIOptimizeExecMasking.cpp
Go to the documentation of this file.
1 //===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
15 
16 using namespace llvm;
17 
18 #define DEBUG_TYPE "si-optimize-exec-masking"
19 
20 namespace {
21 
22 class SIOptimizeExecMasking : public MachineFunctionPass {
23 public:
24  static char ID;
25 
26 public:
27  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
29  }
30 
31  bool runOnMachineFunction(MachineFunction &MF) override;
32 
33  StringRef getPassName() const override {
34  return "SI optimize exec mask operations";
35  }
36 
37  void getAnalysisUsage(AnalysisUsage &AU) const override {
38  AU.setPreservesCFG();
40  }
41 };
42 
43 } // End anonymous namespace.
44 
45 INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
46  "SI optimize exec mask operations", false, false)
48 INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
49  "SI optimize exec mask operations", false, false)
50 
51 char SIOptimizeExecMasking::ID = 0;
52 
53 char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
54 
55 /// If \p MI is a copy from exec, return the register copied to.
57  switch (MI.getOpcode()) {
58  case AMDGPU::COPY:
59  case AMDGPU::S_MOV_B64:
60  case AMDGPU::S_MOV_B64_term:
61  case AMDGPU::S_MOV_B32:
62  case AMDGPU::S_MOV_B32_term: {
63  const MachineOperand &Src = MI.getOperand(1);
64  if (Src.isReg() &&
65  Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
66  return MI.getOperand(0).getReg();
67  }
68  }
69 
70  return AMDGPU::NoRegister;
71 }
72 
73 /// If \p MI is a copy to exec, return the register copied from.
75  switch (MI.getOpcode()) {
76  case AMDGPU::COPY:
77  case AMDGPU::S_MOV_B64:
78  case AMDGPU::S_MOV_B32: {
79  const MachineOperand &Dst = MI.getOperand(0);
80  if (Dst.isReg() &&
81  Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
82  MI.getOperand(1).isReg())
83  return MI.getOperand(1).getReg();
84  break;
85  }
86  case AMDGPU::S_MOV_B64_term:
87  case AMDGPU::S_MOV_B32_term:
88  llvm_unreachable("should have been replaced");
89  }
90 
91  return Register();
92 }
93 
94 /// If \p MI is a logical operation on an exec value,
95 /// return the register copied to.
97  switch (MI.getOpcode()) {
98  case AMDGPU::S_AND_B64:
99  case AMDGPU::S_OR_B64:
100  case AMDGPU::S_XOR_B64:
101  case AMDGPU::S_ANDN2_B64:
102  case AMDGPU::S_ORN2_B64:
103  case AMDGPU::S_NAND_B64:
104  case AMDGPU::S_NOR_B64:
105  case AMDGPU::S_XNOR_B64: {
106  const MachineOperand &Src1 = MI.getOperand(1);
107  if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
108  return MI.getOperand(0).getReg();
109  const MachineOperand &Src2 = MI.getOperand(2);
110  if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
111  return MI.getOperand(0).getReg();
112  break;
113  }
114  case AMDGPU::S_AND_B32:
115  case AMDGPU::S_OR_B32:
116  case AMDGPU::S_XOR_B32:
117  case AMDGPU::S_ANDN2_B32:
118  case AMDGPU::S_ORN2_B32:
119  case AMDGPU::S_NAND_B32:
120  case AMDGPU::S_NOR_B32:
121  case AMDGPU::S_XNOR_B32: {
122  const MachineOperand &Src1 = MI.getOperand(1);
123  if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
124  return MI.getOperand(0).getReg();
125  const MachineOperand &Src2 = MI.getOperand(2);
126  if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
127  return MI.getOperand(0).getReg();
128  break;
129  }
130  }
131 
132  return AMDGPU::NoRegister;
133 }
134 
135 static unsigned getSaveExecOp(unsigned Opc) {
136  switch (Opc) {
137  case AMDGPU::S_AND_B64:
138  return AMDGPU::S_AND_SAVEEXEC_B64;
139  case AMDGPU::S_OR_B64:
140  return AMDGPU::S_OR_SAVEEXEC_B64;
141  case AMDGPU::S_XOR_B64:
142  return AMDGPU::S_XOR_SAVEEXEC_B64;
143  case AMDGPU::S_ANDN2_B64:
144  return AMDGPU::S_ANDN2_SAVEEXEC_B64;
145  case AMDGPU::S_ORN2_B64:
146  return AMDGPU::S_ORN2_SAVEEXEC_B64;
147  case AMDGPU::S_NAND_B64:
148  return AMDGPU::S_NAND_SAVEEXEC_B64;
149  case AMDGPU::S_NOR_B64:
150  return AMDGPU::S_NOR_SAVEEXEC_B64;
151  case AMDGPU::S_XNOR_B64:
152  return AMDGPU::S_XNOR_SAVEEXEC_B64;
153  case AMDGPU::S_AND_B32:
154  return AMDGPU::S_AND_SAVEEXEC_B32;
155  case AMDGPU::S_OR_B32:
156  return AMDGPU::S_OR_SAVEEXEC_B32;
157  case AMDGPU::S_XOR_B32:
158  return AMDGPU::S_XOR_SAVEEXEC_B32;
159  case AMDGPU::S_ANDN2_B32:
160  return AMDGPU::S_ANDN2_SAVEEXEC_B32;
161  case AMDGPU::S_ORN2_B32:
162  return AMDGPU::S_ORN2_SAVEEXEC_B32;
163  case AMDGPU::S_NAND_B32:
164  return AMDGPU::S_NAND_SAVEEXEC_B32;
165  case AMDGPU::S_NOR_B32:
166  return AMDGPU::S_NOR_SAVEEXEC_B32;
167  case AMDGPU::S_XNOR_B32:
168  return AMDGPU::S_XNOR_SAVEEXEC_B32;
169  default:
170  return AMDGPU::INSTRUCTION_LIST_END;
171  }
172 }
173 
174 // These are only terminators to get correct spill code placement during
175 // register allocation, so turn them back into normal instructions.
177  switch (MI.getOpcode()) {
178  case AMDGPU::S_MOV_B32_term: {
179  bool RegSrc = MI.getOperand(1).isReg();
180  MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
181  return true;
182  }
183  case AMDGPU::S_MOV_B64_term: {
184  bool RegSrc = MI.getOperand(1).isReg();
185  MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
186  return true;
187  }
188  case AMDGPU::S_XOR_B64_term: {
189  // This is only a terminator to get the correct spill code placement during
190  // register allocation.
191  MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
192  return true;
193  }
194  case AMDGPU::S_XOR_B32_term: {
195  // This is only a terminator to get the correct spill code placement during
196  // register allocation.
197  MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
198  return true;
199  }
200  case AMDGPU::S_OR_B64_term: {
201  // This is only a terminator to get the correct spill code placement during
202  // register allocation.
203  MI.setDesc(TII.get(AMDGPU::S_OR_B64));
204  return true;
205  }
206  case AMDGPU::S_OR_B32_term: {
207  // This is only a terminator to get the correct spill code placement during
208  // register allocation.
209  MI.setDesc(TII.get(AMDGPU::S_OR_B32));
210  return true;
211  }
212  case AMDGPU::S_ANDN2_B64_term: {
213  // This is only a terminator to get the correct spill code placement during
214  // register allocation.
215  MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
216  return true;
217  }
218  case AMDGPU::S_ANDN2_B32_term: {
219  // This is only a terminator to get the correct spill code placement during
220  // register allocation.
221  MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
222  return true;
223  }
224  case AMDGPU::S_AND_B64_term: {
225  // This is only a terminator to get the correct spill code placement during
226  // register allocation.
227  MI.setDesc(TII.get(AMDGPU::S_AND_B64));
228  return true;
229  }
230  case AMDGPU::S_AND_B32_term: {
231  // This is only a terminator to get the correct spill code placement during
232  // register allocation.
233  MI.setDesc(TII.get(AMDGPU::S_AND_B32));
234  return true;
235  }
236  default:
237  return false;
238  }
239 }
240 
241 // Turn all pseudoterminators in the block into their equivalent non-terminator
242 // instructions. Returns the reverse iterator to the first non-terminator
243 // instruction in the block.
245  const SIInstrInfo &TII,
248 
249  bool Seen = false;
250  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
251  for (; I != E; ++I) {
252  if (!I->isTerminator())
253  return Seen ? FirstNonTerm : I;
254 
255  if (removeTerminatorBit(TII, *I)) {
256  if (!Seen) {
257  FirstNonTerm = I;
258  Seen = true;
259  }
260  }
261  }
262 
263  return FirstNonTerm;
264 }
265 
267  const SIInstrInfo &TII,
268  const GCNSubtarget &ST,
271  unsigned CopyToExec) {
272  const unsigned InstLimit = 25;
273 
274  auto E = MBB.rend();
275  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
276  Register CopyFromExec = isCopyFromExec(*I, ST);
277  if (CopyFromExec.isValid())
278  return I;
279  }
280 
281  return E;
282 }
283 
284 // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
285 // report the register as unavailable because a super-register with a lane mask
286 // is unavailable.
287 static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
288  for (MachineBasicBlock *Succ : MBB.successors()) {
289  if (Succ->isLiveIn(Reg))
290  return true;
291  }
292 
293  return false;
294 }
295 
296 // Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
297 // the beginning of the BB is reached or Pred evaluates to true - which can be
298 // an arbitrary condition based on the current MachineInstr, for instance an
299 // target instruction. Breaks prematurely by returning nullptr if one of the
300 // registers given in NonModifiableRegs is modified by the current instruction.
301 static MachineInstr *
303  std::function<bool(MachineInstr *)> Pred,
304  ArrayRef<MCRegister> NonModifiableRegs,
305  const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
307  E = Origin.getParent()->rend();
308  unsigned CurrentIteration = 0;
309 
310  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
311  if (A->isDebugInstr())
312  continue;
313 
314  if (Pred(&*A))
315  return &*A;
316 
317  for (MCRegister Reg : NonModifiableRegs) {
318  if (A->modifiesRegister(Reg, TRI))
319  return nullptr;
320  }
321 
322  ++CurrentIteration;
323  }
324 
325  return nullptr;
326 }
327 
328 
329 // Determine if a register Reg is not re-defined and still in use
330 // in the range (Stop..Start].
331 // It does so by backwards calculating liveness from the end of the BB until
332 // either Stop or the beginning of the BB is reached.
333 // After liveness is calculated, we can determine if Reg is still in use and not
334 // defined inbetween the instructions.
338  bool useLiveOuts = false,
339  bool ignoreStart = false) {
340  LivePhysRegs LR(*TRI);
341  if (useLiveOuts)
342  LR.addLiveOuts(*Stop.getParent());
343 
346 
347  if (ignoreStart)
348  ++A;
349 
350  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
351  LR.stepBackward(*A);
352  }
353 
354  return !LR.available(MRI, Reg);
355 }
356 
357 // Determine if a register Reg is not re-defined and still in use
358 // in the range (Stop..BB.end].
360  const SIRegisterInfo *TRI,
362  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
363  MRI, true);
364 }
365 
366 // Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
367 // by looking at an instance of a s_and_saveexec instruction. Returns a pointer
368 // to the v_cmp instruction if it is safe to replace the sequence (see the
369 // conditions in the function body). This is after register allocation, so some
370 // checks on operand dependencies need to be considered.
372  MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
374 
375  MachineInstr *VCmp = nullptr;
376 
377  Register SaveExecDest = SaveExec.getOperand(0).getReg();
378  if (!TRI->isSGPRReg(MRI, SaveExecDest))
379  return nullptr;
380 
381  MachineOperand *SaveExecSrc0 =
382  TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
383  if (!SaveExecSrc0->isReg())
384  return nullptr;
385 
386  // Try to find the last v_cmp instruction that defs the saveexec input
387  // operand without any write to Exec or the saveexec input operand inbetween.
388  VCmp = findInstrBackwards(
389  SaveExec,
390  [&](MachineInstr *Check) {
391  return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
392  Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
393  },
394  {Exec, SaveExecSrc0->getReg()}, TRI);
395 
396  if (!VCmp)
397  return nullptr;
398 
399  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
400  assert(VCmpDest && "Should have an sdst operand!");
401 
402  // Check if any of the v_cmp source operands is written by the saveexec.
403  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
404  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
405  SaveExec.modifiesRegister(Src0->getReg(), TRI))
406  return nullptr;
407 
408  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
409  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
410  SaveExec.modifiesRegister(Src1->getReg(), TRI))
411  return nullptr;
412 
413  // Don't do the transformation if the destination operand is included in
414  // it's MBB Live-outs, meaning it's used in any of it's successors, leading
415  // to incorrect code if the v_cmp and therefore the def of
416  // the dest operand is removed.
417  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
418  return nullptr;
419 
420  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
421  // s_and_saveexec, skip the optimization.
422  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
423  false, true) ||
424  isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
425  return nullptr;
426 
427  // Try to determine if there is a write to any of the VCmp
428  // operands between the saveexec and the vcmp.
429  // If yes, additional VGPR spilling might need to be inserted. In this case,
430  // it's not worth replacing the instruction sequence.
431  SmallVector<MCRegister, 2> NonDefRegs;
432  if (Src0->isReg())
433  NonDefRegs.push_back(Src0->getReg());
434 
435  if (Src1->isReg())
436  NonDefRegs.push_back(Src1->getReg());
437 
438  if (!findInstrBackwards(
439  SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
440  NonDefRegs, TRI))
441  return nullptr;
442 
443  return VCmp;
444 }
445 
446 // Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
447 // operands extracted from a v_cmp ..., s_and_saveexec pattern.
448 static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
449  MachineInstr &VCmp, MCRegister Exec,
450  const SIInstrInfo *TII,
451  const SIRegisterInfo *TRI,
453  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
454 
455  if (NewOpcode == -1)
456  return false;
457 
458  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
459  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
460 
461  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
462 
463  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
464  if (!SaveExecInstr.uses().empty()) {
465  bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
466  unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
467  BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
468  SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
469  .addReg(Exec);
470  }
471 
472  // Omit dst as V_CMPX is implicitly writing to EXEC.
473  // Add dummy src and clamp modifiers, if needed.
474  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
475  VCmp.getDebugLoc(), TII->get(NewOpcode));
476 
477  auto TryAddImmediateValueFromNamedOperand =
478  [&](unsigned OperandName) -> void {
479  if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
480  Builder.addImm(Mod->getImm());
481  };
482 
483  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
484  Builder.add(*Src0);
485 
486  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
487  Builder.add(*Src1);
488 
489  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
490 
491  // The kill flags may no longer be correct.
492  if (Src0->isReg())
493  MRI.clearKillFlags(Src0->getReg());
494  if (Src1->isReg())
495  MRI.clearKillFlags(Src1->getReg());
496 
497  return true;
498 }
499 
500 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
501  if (skipFunction(MF.getFunction()))
502  return false;
503 
504  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
505  const SIRegisterInfo *TRI = ST.getRegisterInfo();
506  const SIInstrInfo *TII = ST.getInstrInfo();
508  MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
509 
510  // Optimize sequences emitted for control flow lowering. They are originally
511  // emitted as the separate operations because spill code may need to be
512  // inserted for the saved copy of exec.
513  //
514  // x = copy exec
515  // z = s_<op>_b64 x, y
516  // exec = copy z
517  // =>
518  // x = s_<op>_saveexec_b64 y
519  //
520 
521  bool Changed = false;
522  for (MachineBasicBlock &MBB : MF) {
525  if (I == E)
526  continue;
527 
528  // It's possible to see other terminator copies after the exec copy. This
529  // can happen if control flow pseudos had their outputs used by phis.
530  Register CopyToExec;
531 
532  unsigned SearchCount = 0;
533  const unsigned SearchLimit = 5;
534  while (I != E && SearchCount++ < SearchLimit) {
535  CopyToExec = isCopyToExec(*I, ST);
536  if (CopyToExec)
537  break;
538  ++I;
539  }
540 
541  if (!CopyToExec)
542  continue;
543 
544  // Scan backwards to find the def.
545  auto CopyToExecInst = &*I;
546  auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
547  if (CopyFromExecInst == E) {
548  auto PrepareExecInst = std::next(I);
549  if (PrepareExecInst == E)
550  continue;
551  // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
552  if (CopyToExecInst->getOperand(1).isKill() &&
553  isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
554  LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
555 
556  PrepareExecInst->getOperand(0).setReg(Exec);
557 
558  LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
559 
560  CopyToExecInst->eraseFromParent();
561  Changed = true;
562  }
563 
564  continue;
565  }
566 
567  if (isLiveOut(MBB, CopyToExec)) {
568  // The copied register is live out and has a second use in another block.
569  LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
570  continue;
571  }
572 
573  Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
574  MachineInstr *SaveExecInst = nullptr;
575  SmallVector<MachineInstr *, 4> OtherUseInsts;
576 
578  = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
579  J != JE; ++J) {
580  if (SaveExecInst && J->readsRegister(Exec, TRI)) {
581  LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
582  // Make sure this is inserted after any VALU ops that may have been
583  // scheduled in between.
584  SaveExecInst = nullptr;
585  break;
586  }
587 
588  bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);
589 
590  if (J->modifiesRegister(CopyToExec, TRI)) {
591  if (SaveExecInst) {
592  LLVM_DEBUG(dbgs() << "Multiple instructions modify "
593  << printReg(CopyToExec, TRI) << '\n');
594  SaveExecInst = nullptr;
595  break;
596  }
597 
598  unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
599  if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
600  break;
601 
602  if (ReadsCopyFromExec) {
603  SaveExecInst = &*J;
604  LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
605  continue;
606  } else {
607  LLVM_DEBUG(dbgs()
608  << "Instruction does not read exec copy: " << *J << '\n');
609  break;
610  }
611  } else if (ReadsCopyFromExec && !SaveExecInst) {
612  // Make sure no other instruction is trying to use this copy, before it
613  // will be rewritten by the saveexec, i.e. hasOneUse. There may have
614  // been another use, such as an inserted spill. For example:
615  //
616  // %sgpr0_sgpr1 = COPY %exec
617  // spill %sgpr0_sgpr1
618  // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
619  //
620  LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
621  << '\n');
622  break;
623  }
624 
625  if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
626  assert(SaveExecInst != &*J);
627  OtherUseInsts.push_back(&*J);
628  }
629  }
630 
631  if (!SaveExecInst)
632  continue;
633 
634  LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
635 
636  MachineOperand &Src0 = SaveExecInst->getOperand(1);
637  MachineOperand &Src1 = SaveExecInst->getOperand(2);
638 
639  MachineOperand *OtherOp = nullptr;
640 
641  if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
642  OtherOp = &Src1;
643  } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
644  if (!SaveExecInst->isCommutable())
645  break;
646 
647  OtherOp = &Src0;
648  } else
649  llvm_unreachable("unexpected");
650 
651  CopyFromExecInst->eraseFromParent();
652 
653  auto InsPt = SaveExecInst->getIterator();
654  const DebugLoc &DL = SaveExecInst->getDebugLoc();
655 
656  BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
657  CopyFromExec)
658  .addReg(OtherOp->getReg());
659  SaveExecInst->eraseFromParent();
660 
661  CopyToExecInst->eraseFromParent();
662 
663  for (MachineInstr *OtherInst : OtherUseInsts) {
664  OtherInst->substituteRegister(CopyToExec, Exec,
665  AMDGPU::NoSubRegister, *TRI);
666  }
667 
668  Changed = true;
669  }
670 
671  // After all s_op_saveexec instructions are inserted,
672  // replace (on GFX10.3 and later)
673  // v_cmp_* SGPR, IMM, VGPR
674  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
675  // with
676  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
677  // v_cmpx_* IMM, VGPR
678  // to reduce pipeline stalls.
679  if (ST.hasGFX10_3Insts()) {
680  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
681  const unsigned AndSaveExecOpcode =
682  ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
683 
684  for (MachineBasicBlock &MBB : MF) {
685  for (MachineInstr &MI : MBB) {
686  // Record relevant v_cmp / s_and_saveexec instruction pairs for
687  // replacement.
688  if (MI.getOpcode() != AndSaveExecOpcode)
689  continue;
690 
691  if (MachineInstr *VCmp =
693  SaveExecVCmpMapping[&MI] = VCmp;
694  }
695  }
696 
697  for (const auto &Entry : SaveExecVCmpMapping) {
698  MachineInstr *SaveExecInstr = Entry.getFirst();
699  MachineInstr *VCmpInstr = Entry.getSecond();
700 
701  if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
702  TRI, *MRI)) {
703  SaveExecInstr->eraseFromParent();
704  VCmpInstr->eraseFromParent();
705 
706  Changed = true;
707  }
708  }
709  }
710 
711  return Changed;
712 }
llvm::MachineInstr::uses
iterator_range< mop_iterator > uses()
Returns a range that includes all operands that are register uses.
Definition: MachineInstr.h:667
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:104
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:50
operations
SI optimize exec mask operations
Definition: SIOptimizeExecMasking.cpp:49
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
fixTerminators
static MachineBasicBlock::reverse_iterator fixTerminators(const SIInstrInfo &TII, MachineBasicBlock &MBB)
Definition: SIOptimizeExecMasking.cpp:244
llvm::X86Disassembler::Reg
Reg
All possible values of the reg field in the ModR/M byte.
Definition: X86DisassemblerDecoder.h:462
isCopyToExec
static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST)
If MI is a copy to exec, return the register copied from.
Definition: SIOptimizeExecMasking.cpp:74
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
llvm::LivePhysRegs
A set of physical registers with utility functions to track liveness when walking backward/forward th...
Definition: LivePhysRegs.h:50
llvm::ilist_node_impl::getReverseIterator
reverse_self_iterator getReverseIterator()
Definition: ilist_node.h:85
optimizeVCMPSaveExecSequence
static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec, const SIInstrInfo *TII, const SIRegisterInfo *TRI, MachineRegisterInfo &MRI)
Definition: SIOptimizeExecMasking.cpp:448
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1628
isRegisterInUseAfter
static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg, const SIRegisterInfo *TRI, MachineRegisterInfo &MRI)
Definition: SIOptimizeExecMasking.cpp:359
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:103
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
isLiveOut
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg)
Definition: SIOptimizeExecMasking.cpp:287
llvm::SIOptimizeExecMaskingID
char & SIOptimizeExecMaskingID
Definition: SIOptimizeExecMasking.cpp:53
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:666
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
GCNSubtarget.h
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:501
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
Check
#define Check(C,...)
Definition: Lint.cpp:170
llvm::LivePhysRegs::addLiveOuts
void addLiveOuts(const MachineBasicBlock &MBB)
Adds all live-out registers of basic block MBB.
Definition: LivePhysRegs.cpp:232
false
Definition: StackSlotColoring.cpp:141
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48
llvm::MachineBasicBlock::rend
reverse_iterator rend()
Definition: MachineBasicBlock.h:288
getSaveExecOp
static unsigned getSaveExecOp(unsigned Opc)
Definition: SIOptimizeExecMasking.cpp:135
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
removeTerminatorBit
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI)
Definition: SIOptimizeExecMasking.cpp:176
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::MachineRegisterInfo::clearKillFlags
void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
Definition: MachineRegisterInfo.cpp:427
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:656
llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:420
llvm::LivePhysRegs::stepBackward
void stepBackward(const MachineInstr &MI)
Simulates liveness when stepping backwards over an instruction(bundle).
Definition: LivePhysRegs.cpp:68
AMDGPUMCTargetDesc.h
isLogicalOpOnExec
static Register isLogicalOpOnExec(const MachineInstr &MI)
If MI is a logical operation on an exec value, return the register copied to.
Definition: SIOptimizeExecMasking.cpp:96
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:320
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::MachineInstr::isCommutable
bool isCommutable(QueryType Type=IgnoreBundle) const
Return true if this may be a 2- or 3-address instruction (of the form "X = op Y, Z,...
Definition: MachineInstr.h:1064
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::LivePhysRegs::available
bool available(const MachineRegisterInfo &MRI, MCPhysReg Reg) const
Returns true if register Reg and no aliasing register is in the set.
Definition: LivePhysRegs.cpp:141
llvm::DenseMap
Definition: DenseMap.h:716
I
#define I(x, y, z)
Definition: MD5.cpp:58
MachineFunctionPass.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI
StandardInstrumentations SI(Debug, VerifyEach)
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
llvm::initializeSIOptimizeExecMaskingPass
void initializeSIOptimizeExecMaskingPass(PassRegistry &)
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:359
llvm::MachineInstr::readsRegister
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr reads the specified register.
Definition: MachineInstr.h:1346
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::MachineFunction
Definition: MachineFunction.h:257
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:365
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:263
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::MachineBasicBlock::rbegin
reverse_iterator rbegin()
Definition: MachineBasicBlock.h:282
AMDGPU.h
llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:491
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
findExecCopy
static MachineBasicBlock::reverse_iterator findExecCopy(const SIInstrInfo &TII, const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, unsigned CopyToExec)
Definition: SIOptimizeExecMasking.cpp:266
llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:288
MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
findInstrBackwards
static MachineInstr * findInstrBackwards(MachineInstr &Origin, std::function< bool(MachineInstr *)> Pred, ArrayRef< MCRegister > NonModifiableRegs, const SIRegisterInfo *TRI, unsigned MaxInstructions=20)
Definition: SIOptimizeExecMasking.cpp:302
llvm::MachineInstr::modifiesRegister
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
Definition: MachineInstr.h:1384
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:622
llvm::TargetRegisterInfo::getRegSizeInBits
unsigned getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Definition: TargetRegisterInfo.h:277
llvm::ilist_iterator
Iterator for intrusive lists based on ilist_node.
Definition: ilist_iterator.h:57
llvm::misexpect::clamp
uint32_t clamp(uint64_t value, uint32_t low, uint32_t hi)
Definition: MisExpect.cpp:150
llvm::LiveIntervals
Definition: LiveIntervals.h:54
llvm::SIInstrInfo
Definition: SIInstrInfo.h:44
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:328
llvm::Register::isValid
bool isValid() const
Definition: Register.h:126
N
#define N
DEBUG_TYPE
#define DEBUG_TYPE
Definition: SIOptimizeExecMasking.cpp:18
llvm::sys::path::rend
reverse_iterator rend(StringRef path)
Get reverse end iterator over path.
Definition: Path.cpp:306
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE, "SI optimize exec mask operations", false, false) INITIALIZE_PASS_END(SIOptimizeExecMasking
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
Mod
Module * Mod
Definition: PassBuilderBindings.cpp:54
llvm::printReg
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Definition: TargetRegisterInfo.cpp:111
llvm::MachineInstr::eraseFromParent
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Definition: MachineInstr.cpp:650
llvm::MachineInstrBundleIterator
MachineBasicBlock iterator that automatically skips over MIs that are inside bundles (i....
Definition: MachineInstrBundleIterator.h:108
isCopyFromExec
static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST)
If MI is a copy from exec, return the register copied to.
Definition: SIOptimizeExecMasking.cpp:56
InitializePasses.h
llvm::AMDGPU::getVCMPXOpFromVCMP
LLVM_READONLY int getVCMPXOpFromVCMP(uint16_t Opcode)
isRegisterInUseBetween
static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, MCRegister Reg, const SIRegisterInfo *TRI, MachineRegisterInfo &MRI, bool useLiveOuts=false, bool ignoreStart=false)
Definition: SIOptimizeExecMasking.cpp:335
findPossibleVCMPVCMPXOptimization
static MachineInstr * findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI, const SIInstrInfo *TII, MachineRegisterInfo &MRI)
Definition: SIOptimizeExecMasking.cpp:371
llvm::MCRegister
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:24
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
LivePhysRegs.h