//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

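// Execution states tracked by the analysis below. These are bitmask values
// so that "any of these states is acceptable" can be expressed as a union,
// e.g. StateExact | StateWQM for an instruction that doesn't care.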
enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

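// Per-instruction and per-block analysis results. All fields are bitmasks of
// the State* values above: Needs is the set of states required, Disabled the
// set of states that must not be active, and InNeeds/OutNeeds the states
// required on entry and exit, respectively.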
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
207 
208 } // end anonymous namespace
209 
210 char SIWholeQuadMode::ID = 0;
211 
212 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
213  false)
215 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
216  false)
217 
218 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
219 
221  return new SIWholeQuadMode;
222 }
223 
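// Dump the per-block and per-instruction analysis results for debugging.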
#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

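/// Mark \p MI as needing the states in \p Flag, minus any states the
/// instruction has disabled, and queue it for propagation if that adds
/// anything new.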
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TRI->isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
        Flags = StateWQM;
      } else if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and it also needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            unsigned Reg = Inactive.getReg();
            if (TRI->isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

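/// Propagate the needs of \p MI backwards: to the preceding instruction, to
/// the containing block, and to the instructions that define its uses.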
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);
}

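/// Propagate the needs of \p MBB to its last instruction and to its
/// predecessor and successor blocks.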
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

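/// Scan the function and iterate the worklist to a fixed point; returns the
/// union of all states required anywhere in the function.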
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

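/// Insert a copy that saves SCC and a copy that restores it, both at
/// \p Before. Returns an iterator to the restore copy, so that instructions
/// inserted there end up between the save and the restore.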
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

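/// Switch to Exact mode at \p Before by masking EXEC with the live mask,
/// optionally saving the previous (WQM) exec mask into \p SaveWQM.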
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

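/// Switch to WQM at \p Before, either by restoring the exec mask saved in
/// \p SavedWQM or by recomputing it from the current EXEC with S_WQM_B64.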
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

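/// Enter WWM at \p Before by setting all bits of EXEC, saving the previous
/// exec mask in \p SaveOrig.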
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
               SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

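/// Leave WWM at \p Before, restoring the exec mask saved in \p SavedOrig.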
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

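/// Rewrite \p MBB based on the analysis results: walk the instructions and
/// insert the transitions between Exact, WQM and WWM, deferring each switch
/// where possible to avoid redundant transitions.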
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state that
          // already matches our needs, but we shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

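/// Replace all SI_PS_LIVE pseudo instructions with copies of \p LiveMaskReg.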
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

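/// Rewrite the WQM/WWM/V_SET_INACTIVE pseudos collected during the scan into
/// plain COPYs, dropping their extra operands.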
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);
    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

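// Pass entry point: run the analysis, then insert the exec mask transitions
// and lower the pseudo instructions. Shaders that never need WQM or WWM are
// left untouched apart from lowering live mask queries.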
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    if (!(GlobalFlags & StateWWM))
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}