LLVM  6.0.0svn
SIWholeQuadMode.cpp
Go to the documentation of this file.
1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief This pass adds instructions to enable whole quad mode for pixel
12 /// shaders, and whole wavefront mode for all programs.
13 ///
14 /// Whole quad mode is required for derivative computations, but it interferes
15 /// with shader side effects (stores and atomics). This pass is run on the
16 /// scheduled machine IR but before register coalescing, so that machine SSA is
17 /// available for analysis. It ensures that WQM is enabled when necessary, but
18 /// disabled around stores and atomics.
19 ///
20 /// When necessary, this pass creates a function prolog
21 ///
22 /// S_MOV_B64 LiveMask, EXEC
23 /// S_WQM_B64 EXEC, EXEC
24 ///
25 /// to enter WQM at the top of the function and surrounds blocks of Exact
26 /// instructions by
27 ///
28 /// S_AND_SAVEEXEC_B64 Tmp, LiveMask
29 /// ...
30 /// S_MOV_B64 EXEC, Tmp
31 ///
32 /// We also compute when a sequence of instructions requires Whole Wavefront
33 /// Mode (WWM) and insert instructions to save and restore it:
34 ///
35 /// S_OR_SAVEEXEC_B64 Tmp, -1
36 /// ...
37 /// S_MOV_B64 EXEC, Tmp
38 ///
39 /// In order to avoid excessive switching during sequences of Exact
40 /// instructions, the pass first analyzes which instructions must be run in WQM
41 /// (aka which instructions produce values that lead to derivative
42 /// computations).
43 ///
44 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
45 ///
46 /// There is room for improvement given better control flow analysis:
47 ///
48 /// (1) at the top level (outside of control flow statements, and as long as
49 /// kill hasn't been used), one SGPR can be saved by recovering WQM from
50 /// the LiveMask (this is implemented for the entry block).
51 ///
52 /// (2) when entire regions (e.g. if-else blocks or entire loops) only
53 /// consist of exact and don't-care instructions, the switch only has to
54 /// be done at the entry and exit points rather than potentially in each
55 /// block of the region.
56 ///
57 //===----------------------------------------------------------------------===//
58 
59 #include "AMDGPU.h"
60 #include "AMDGPUSubtarget.h"
61 #include "SIInstrInfo.h"
62 #include "SIMachineFunctionInfo.h"
63 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/SmallVector.h"
66 #include "llvm/ADT/StringRef.h"
78 #include "llvm/IR/CallingConv.h"
79 #include "llvm/IR/DebugLoc.h"
80 #include "llvm/MC/MCRegisterInfo.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/Debug.h"
84 #include <cassert>
85 #include <vector>
86 
87 using namespace llvm;
88 
89 #define DEBUG_TYPE "si-wqm"
90 
91 namespace {
92 
/// Bit flags naming the execution-mask states this pass reasons about. The
/// values are distinct bits so that sets of states can be formed with '|'.
enum {
  StateWQM = 1 << 0,   // whole quad mode
  StateWWM = 1 << 1,   // whole wavefront mode
  StateExact = 1 << 2, // exact execution mask
};
98 
/// Wrapper around a set of state bits; exists so that a dedicated stream
/// printer can be selected for it.
struct PrintState {
  explicit PrintState(int State) : State{State} {}

  int State;
};
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108  if (PS.State & StateWQM)
109  OS << "WQM";
110  if (PS.State & StateWWM) {
111  if (PS.State & StateWQM)
112  OS << '|';
113  OS << "WWM";
114  }
115  if (PS.State & StateExact) {
116  if (PS.State & (StateWQM | StateWWM))
117  OS << '|';
118  OS << "Exact";
119  }
120 
121  return OS;
122 }
123 #endif
124 
/// Per-instruction analysis record. Each field is a set of State* bits.
struct InstrInfo {
  char Needs{0};    // states requested for this instruction
  char Disabled{0}; // states that are stripped from any request
  char OutNeeds{0}; // states accumulated from the instructions that follow
};
130 
/// Per-basic-block analysis record. Each field is a set of State* bits.
struct BlockInfo {
  char Needs{0};    // states requested by instructions inside the block
  char InNeeds{0};  // states associated with the block entry
  char OutNeeds{0}; // states associated with the block exit
};
136 
137 struct WorkItem {
138  MachineBasicBlock *MBB = nullptr;
139  MachineInstr *MI = nullptr;
140 
141  WorkItem() = default;
142  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
143  WorkItem(MachineInstr *MI) : MI(MI) {}
144 };
145 
// Machine function pass that computes, per instruction and per block, which
// execution-mask state (WQM, WWM or Exact) is needed, and then inserts the
// instructions that switch between those states.
146 class SIWholeQuadMode : public MachineFunctionPass {
147 private:
// Cached per-function context; all of these are (re)assigned at the start of
// runOnMachineFunction.
148  CallingConv::ID CallingConv;
149  const SIInstrInfo *TII;
150  const SIRegisterInfo *TRI;
// NOTE(review): this listing skips original line 151. The member functions
// use a pointer named MRI whose declaration is not visible here; presumably
// it was declared on the dropped line. Confirm against the upstream file.
152  LiveIntervals *LIS;
153 
// NOTE(review): original lines 154-155 are also missing from this listing.
// The maps named Instructions and Blocks used throughout the pass are
// presumably declared there. Confirm against the upstream file.
// SI_PS_LIVE instructions collected by scanInstructions, rewritten later by
// lowerLiveMaskQueries.
156  SmallVector<MachineInstr *, 1> LiveMaskQueries;
// Instructions collected by scanInstructions that lowerCopyInstrs turns into
// plain COPYs.
157  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
158 
// Debug dump of the analysis results (defined only when NDEBUG is not set).
159  void printInfo();
160 
// Analysis phase: seed requirements, then propagate them over a worklist.
161  void markInstruction(MachineInstr &MI, char Flag,
162  std::vector<WorkItem> &Worklist);
163  void markInstructionUses(const MachineInstr &MI, char Flag,
164  std::vector<WorkItem> &Worklist);
165  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
166  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
167  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
168  char analyzeFunction(MachineFunction &MF);
169 
170  bool requiresCorrectState(const MachineInstr &MI) const;
171 
// NOTE(review): original lines 172-174 are missing from this listing. The
// declaration of saveSCC and the return type of prepareInsertion are
// presumably on them (the out-of-line definition of prepareInsertion returns
// MachineBasicBlock::iterator).
// Transformation phase: choose insertion points and emit state switches.
175  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
176  MachineBasicBlock::iterator Last, bool PreferLast,
177  bool SaveSCC);
178  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
179  unsigned SaveWQM, unsigned LiveMaskReg);
180  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
181  unsigned SavedWQM);
182  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
183  unsigned SaveOrig);
184  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
185  unsigned SavedOrig);
186  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
187 
188  void lowerLiveMaskQueries(unsigned LiveMaskReg);
189  void lowerCopyInstrs();
190 
191 public:
// Pass identification; its address is what matters, not its value.
192  static char ID;
193 
194  SIWholeQuadMode() :
195  MachineFunctionPass(ID) { }
196 
197  bool runOnMachineFunction(MachineFunction &MF) override;
198 
199  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
200 
// NOTE(review): original lines 202 and 204 are missing from this listing.
// runOnMachineFunction calls getAnalysis<LiveIntervals>(), so a matching
// requirement is presumably registered on one of them. Confirm upstream.
201  void getAnalysisUsage(AnalysisUsage &AU) const override {
203  AU.setPreservesCFG();
205  }
206 };
207 
208 } // end anonymous namespace
209 
// Storage for the pass identifier declared in the class.
210 char SIWholeQuadMode::ID = 0;
211 
// Pass registration under the DEBUG_TYPE name.
// NOTE(review): original line 214, between the BEGIN and END macros, is
// missing from this listing; presumably it declared a pass dependency.
// Confirm against the upstream file.
212 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
213  false)
215 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
216  false)
217 
// Exported reference to the pass identifier.
218 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
219 
221  return new SIWholeQuadMode;
222 }
223 
224 #ifndef NDEBUG
225 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
226  for (const auto &BII : Blocks) {
227  dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
228  << " InNeeds = " << PrintState(BII.second.InNeeds)
229  << ", Needs = " << PrintState(BII.second.Needs)
230  << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
231 
232  for (const MachineInstr &MI : *BII.first) {
233  auto III = Instructions.find(&MI);
234  if (III == Instructions.end())
235  continue;
236 
237  dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
238  << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
239  }
240  }
241 }
242 #endif
243 
244 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
245  std::vector<WorkItem> &Worklist) {
246  InstrInfo &II = Instructions[&MI];
247 
248  assert(!(Flag & StateExact) && Flag != 0);
249 
250  // Remove any disabled states from the flag. The user that required it gets
251  // an undefined value in the helper lanes. For example, this can happen if
252  // the result of an atomic is used by instruction that requires WQM, where
253  // ignoring the request for WQM is correct as per the relevant specs.
254  Flag &= ~II.Disabled;
255 
256  // Ignore if the flag is already encompassed by the existing needs, or we
257  // just disabled everything.
258  if ((II.Needs & Flag) == Flag)
259  return;
260 
261  II.Needs |= Flag;
262  Worklist.push_back(&MI);
263 }
264 
265 /// Mark all instructions defining the uses in \p MI with \p Flag.
///
/// Every register use operand of \p MI is visited; the instructions found to
/// define it are passed to markInstruction together with \p Worklist.
266 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
267  std::vector<WorkItem> &Worklist) {
268  for (const MachineOperand &Use : MI.uses()) {
// Skip non-register operands and register operands that are not uses.
269  if (!Use.isReg() || !Use.isUse())
270  continue;
271 
272  unsigned Reg = Use.getReg();
273 
274  // Handle physical registers that we need to track; this is mostly relevant
275  // for VCC, which can appear as the (implicit) input of a uniform branch,
276  // e.g. when a loop counter is stored in a VGPR.
// NOTE(review): original line 277 is missing from this listing. Judging by
// the comment above and the closing brace below, it presumably opened an
// `if` that selects physical registers. Confirm against the upstream file.
// EXEC itself is never tracked.
278  if (Reg == AMDGPU::EXEC)
279  continue;
280 
// Look up, per register unit, the value live into MI and mark its defining
// instruction.
281  for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
282  LiveRange &LR = LIS->getRegUnit(*RegUnit);
283  const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
284  if (!Value)
285  continue;
286 
287  // Since we're in machine SSA, we do not need to track physical
288  // registers across basic blocks.
289  if (Value->isPHIDef())
290  continue;
291 
292  markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
293  Worklist);
294  }
295 
296  continue;
297  }
298 
// Remaining case: mark every instruction that defines the register.
299  for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
300  markInstruction(DefMI, Flag, Worklist);
301  }
302 }
303 
304 // Scan instructions to determine which ones require an Exact execmask and
305 // which ones seed WQM requirements.
//
// Seeds the per-instruction and per-block records, fills \p Worklist with the
// items that need propagation, and collects the instructions that are lowered
// later (LiveMaskQueries, LowerToCopyInstrs). Returns the union of the State*
// flags that were seeded anywhere in \p MF.
306 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
307  std::vector<WorkItem> &Worklist) {
308  char GlobalFlags = 0;
309  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
310  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
311 
312  // We need to visit the basic blocks in reverse post-order so that we visit
313  // defs before uses, in particular so that we don't accidentally mark an
314  // instruction as needing e.g. WQM before visiting it and realizing it needs
315  // WQM disabled.
// NOTE(review): original line 316 is missing from this listing; the RPOT
// object iterated below is presumably declared there. Confirm upstream.
317  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
318  MachineBasicBlock &MBB = **BI;
319  BlockInfo &BBI = Blocks[&MBB];
320 
321  for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
322  MachineInstr &MI = *II;
323  InstrInfo &III = Instructions[&MI];
324  unsigned Opcode = MI.getOpcode();
325  char Flags = 0;
326 
// DS instructions in pixel shaders are themselves marked WQM.
327  if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
328  Flags = StateWQM;
329  } else if (TII->isWQM(Opcode)) {
330  // Sampling instructions don't need to produce results for all pixels
331  // in a quad, they just require all inputs of a quad to have been
332  // computed for derivatives.
333  markInstructionUses(MI, StateWQM, Worklist);
334  GlobalFlags |= StateWQM;
335  continue;
336  } else if (Opcode == AMDGPU::WQM) {
337  // The WQM intrinsic requires its output to have all the helper lanes
338  // correct, so we need it to be in WQM.
339  Flags = StateWQM;
340  LowerToCopyInstrs.push_back(&MI);
341  } else if (Opcode == AMDGPU::WWM) {
342  // The WWM intrinsic doesn't make the same guarantee, and plus it needs
343  // to be executed in WQM or Exact so that its copy doesn't clobber
344  // inactive lanes.
345  markInstructionUses(MI, StateWWM, Worklist);
346  GlobalFlags |= StateWWM;
347  LowerToCopyInstrs.push_back(&MI);
348  continue;
349  } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
350  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
// SET_INACTIVE itself never runs in WWM, but the definition of its
// "inactive" operand (operand 2) is requested in WWM.
351  III.Disabled = StateWWM;
352  MachineOperand &Inactive = MI.getOperand(2);
353  if (Inactive.isReg()) {
354  if (Inactive.isUndef()) {
355  LowerToCopyInstrs.push_back(&MI);
356  } else {
357  unsigned Reg = Inactive.getReg();
// NOTE(review): original line 358 is missing from this listing. The brace
// count below implies it opened an `if` on Reg whose condition is not
// visible here. Confirm against the upstream file.
359  for (MachineInstr &DefMI : MRI->def_instructions(Reg))
360  markInstruction(DefMI, StateWWM, Worklist);
361  }
362  }
363  }
364  SetInactiveInstrs.push_back(&MI);
365  continue;
366  } else if (TII->isDisableWQM(MI)) {
// Instructions that disable WQM seed an Exact requirement on the block and
// refuse both WQM and WWM themselves.
367  BBI.Needs |= StateExact;
368  if (!(BBI.InNeeds & StateExact)) {
369  BBI.InNeeds |= StateExact;
370  Worklist.push_back(&MBB);
371  }
372  GlobalFlags |= StateExact;
373  III.Disabled = StateWQM | StateWWM;
374  continue;
375  } else {
376  if (Opcode == AMDGPU::SI_PS_LIVE) {
377  LiveMaskQueries.push_back(&MI);
378  } else if (WQMOutputs) {
379  // The function is in machine SSA form, which means that physical
380  // VGPRs correspond to shader inputs and outputs. Inputs are
381  // only used, outputs are only defined.
382  for (const MachineOperand &MO : MI.defs()) {
383  if (!MO.isReg())
384  continue;
385 
386  unsigned Reg = MO.getReg();
387 
388  if (!TRI->isVirtualRegister(Reg) &&
389  TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
390  Flags = StateWQM;
391  break;
392  }
393  }
394  }
395 
// Nothing was seeded for this instruction.
396  if (!Flags)
397  continue;
398  }
399 
400  markInstruction(MI, Flags, Worklist);
401  GlobalFlags |= Flags;
402  }
403  }
404 
405  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
406  // ever used anywhere in the function. This implements the corresponding
407  // semantics of @llvm.amdgcn.set.inactive.
408  if (GlobalFlags & StateWQM) {
409  for (MachineInstr *MI : SetInactiveInstrs)
410  markInstruction(*MI, StateWQM, Worklist);
411  }
412 
413  return GlobalFlags;
414 }
415 
416 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
417  std::vector<WorkItem>& Worklist) {
418  MachineBasicBlock *MBB = MI.getParent();
419  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
420  BlockInfo &BI = Blocks[MBB];
421 
422  // Control flow-type instructions and stores to temporary memory that are
423  // followed by WQM computations must themselves be in WQM.
424  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
425  (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
426  Instructions[&MI].Needs = StateWQM;
427  II.Needs = StateWQM;
428  }
429 
430  // Propagate to block level
431  if (II.Needs & StateWQM) {
432  BI.Needs |= StateWQM;
433  if (!(BI.InNeeds & StateWQM)) {
434  BI.InNeeds |= StateWQM;
435  Worklist.push_back(MBB);
436  }
437  }
438 
439  // Propagate backwards within block
440  if (MachineInstr *PrevMI = MI.getPrevNode()) {
441  char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
442  if (!PrevMI->isPHI()) {
443  InstrInfo &PrevII = Instructions[PrevMI];
444  if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
445  PrevII.OutNeeds |= InNeeds;
446  Worklist.push_back(PrevMI);
447  }
448  }
449  }
450 
451  // Propagate WQM flag to instruction inputs
452  assert(!(II.Needs & StateExact));
453 
454  if (II.Needs != 0)
455  markInstructionUses(MI, II.Needs, Worklist);
456 }
457 
458 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
459  std::vector<WorkItem>& Worklist) {
460  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
461 
462  // Propagate through instructions
463  if (!MBB.empty()) {
464  MachineInstr *LastMI = &*MBB.rbegin();
465  InstrInfo &LastII = Instructions[LastMI];
466  if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
467  LastII.OutNeeds |= BI.OutNeeds;
468  Worklist.push_back(LastMI);
469  }
470  }
471 
472  // Predecessor blocks must provide for our WQM/Exact needs.
473  for (MachineBasicBlock *Pred : MBB.predecessors()) {
474  BlockInfo &PredBI = Blocks[Pred];
475  if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
476  continue;
477 
478  PredBI.OutNeeds |= BI.InNeeds;
479  PredBI.InNeeds |= BI.InNeeds;
480  Worklist.push_back(Pred);
481  }
482 
483  // All successors must be prepared to accept the same set of WQM/Exact data.
484  for (MachineBasicBlock *Succ : MBB.successors()) {
485  BlockInfo &SuccBI = Blocks[Succ];
486  if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
487  continue;
488 
489  SuccBI.InNeeds |= BI.OutNeeds;
490  Worklist.push_back(Succ);
491  }
492 }
493 
494 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
495  std::vector<WorkItem> Worklist;
496  char GlobalFlags = scanInstructions(MF, Worklist);
497 
498  while (!Worklist.empty()) {
499  WorkItem WI = Worklist.back();
500  Worklist.pop_back();
501 
502  if (WI.MI)
503  propagateInstruction(*WI.MI, Worklist);
504  else
505  propagateBlock(*WI.MBB, Worklist);
506  }
507 
508  return GlobalFlags;
509 }
510 
511 /// Whether \p MI really requires the exec state computed during analysis.
512 ///
513 /// Scalar instructions must occasionally be marked WQM for correct propagation
514 /// (e.g. thread masks leading up to branches), but when it comes to actual
515 /// execution, they don't care about EXEC.
516 bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
517  if (MI.isTerminator())
518  return true;
519 
520  // Skip instructions that are not affected by EXEC
521  if (TII->isScalarUnit(MI))
522  return false;
523 
524  // Generic instructions such as COPY will either disappear by register
525  // coalescing or be lowered to SALU or VALU instructions.
526  if (MI.isTransient()) {
527  if (MI.getNumExplicitOperands() >= 1) {
528  const MachineOperand &Op = MI.getOperand(0);
529  if (Op.isReg()) {
530  if (TRI->isSGPRReg(*MRI, Op.getReg())) {
531  // SGPR instructions are not affected by EXEC
532  return false;
533  }
534  }
535  }
536  }
537 
538  return true;
539 }
540 
542 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
544  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
545 
546  MachineInstr *Save =
547  BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
548  .addReg(AMDGPU::SCC);
549  MachineInstr *Restore =
550  BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
551  .addReg(SaveReg);
552 
553  LIS->InsertMachineInstrInMaps(*Save);
554  LIS->InsertMachineInstrInMaps(*Restore);
555  LIS->createAndComputeVirtRegInterval(SaveReg);
556 
557  return Restore;
558 }
559 
560 // Return an iterator in the (inclusive) range [First, Last] at which
561 // instructions can be safely inserted, keeping in mind that some of the
562 // instructions we want to add necessarily clobber SCC.
563 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
565  MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
566  if (!SaveSCC)
567  return PreferLast ? Last : First;
568 
569  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
570  auto MBBE = MBB.end();
571  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
572  : LIS->getMBBEndIdx(&MBB);
573  SlotIndex LastIdx =
574  Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
575  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
576  const LiveRange::Segment *S;
577 
578  for (;;) {
579  S = LR.getSegmentContaining(Idx);
580  if (!S)
581  break;
582 
583  if (PreferLast) {
584  SlotIndex Next = S->start.getBaseIndex();
585  if (Next < FirstIdx)
586  break;
587  Idx = Next;
588  } else {
589  SlotIndex Next = S->end.getNextIndex().getBaseIndex();
590  if (Next > LastIdx)
591  break;
592  Idx = Next;
593  }
594  }
595 
597 
598  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
599  MBBI = MI;
600  else {
601  assert(Idx == LIS->getMBBEndIdx(&MBB));
602  MBBI = MBB.end();
603  }
604 
605  if (S)
606  MBBI = saveSCC(MBB, MBBI);
607 
608  return MBBI;
609 }
610 
611 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
613  unsigned SaveWQM, unsigned LiveMaskReg) {
614  MachineInstr *MI;
615 
616  if (SaveWQM) {
617  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
618  SaveWQM)
619  .addReg(LiveMaskReg);
620  } else {
621  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
622  AMDGPU::EXEC)
623  .addReg(AMDGPU::EXEC)
624  .addReg(LiveMaskReg);
625  }
626 
627  LIS->InsertMachineInstrInMaps(*MI);
628 }
629 
630 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
632  unsigned SavedWQM) {
633  MachineInstr *MI;
634 
635  if (SavedWQM) {
636  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
637  .addReg(SavedWQM);
638  } else {
639  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
640  AMDGPU::EXEC)
641  .addReg(AMDGPU::EXEC);
642  }
643 
644  LIS->InsertMachineInstrInMaps(*MI);
645 }
646 
647 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
649  unsigned SaveOrig) {
650  MachineInstr *MI;
651 
652  assert(SaveOrig);
653  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
654  SaveOrig)
655  .addImm(-1);
656  LIS->InsertMachineInstrInMaps(*MI);
657 }
658 
659 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
661  unsigned SavedOrig) {
662  MachineInstr *MI;
663 
664  assert(SavedOrig);
665  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
666  .addReg(SavedOrig);
667  LIS->InsertMachineInstrInMaps(*MI);
668 }
669 
670 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
671  bool isEntry) {
672  auto BII = Blocks.find(&MBB);
673  if (BII == Blocks.end())
674  return;
675 
676  const BlockInfo &BI = BII->second;
677 
678  // This is a non-entry block that is WQM throughout, so no need to do
679  // anything.
680  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
681  return;
682 
683  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
684 
685  unsigned SavedWQMReg = 0;
686  unsigned SavedNonWWMReg = 0;
687  bool WQMFromExec = isEntry;
688  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
689  char NonWWMState = 0;
690 
691  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
692  if (isEntry)
693  ++II; // Skip the instruction that saves LiveMask
694 
695  // This stores the first instruction where it's safe to switch from WQM to
696  // Exact or vice versa.
697  MachineBasicBlock::iterator FirstWQM = IE;
698 
699  // This stores the first instruction where it's safe to switch from WWM to
700  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
701  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
702  // switch to/from WQM as well.
703  MachineBasicBlock::iterator FirstWWM = IE;
704  for (;;) {
705  MachineBasicBlock::iterator Next = II;
706  char Needs = StateExact | StateWQM; // WWM is disabled by default
707  char OutNeeds = 0;
708 
709  if (FirstWQM == IE)
710  FirstWQM = II;
711 
712  if (FirstWWM == IE)
713  FirstWWM = II;
714 
715  // First, figure out the allowed states (Needs) based on the propagated
716  // flags.
717  if (II != IE) {
718  MachineInstr &MI = *II;
719 
720  if (requiresCorrectState(MI)) {
721  auto III = Instructions.find(&MI);
722  if (III != Instructions.end()) {
723  if (III->second.Needs & StateWWM)
724  Needs = StateWWM;
725  else if (III->second.Needs & StateWQM)
726  Needs = StateWQM;
727  else
728  Needs &= ~III->second.Disabled;
729  OutNeeds = III->second.OutNeeds;
730  }
731  } else {
732  // If the instruction doesn't actually need a correct EXEC, then we can
733  // safely leave WWM enabled.
734  Needs = StateExact | StateWQM | StateWWM;
735  }
736 
737  if (MI.isTerminator() && OutNeeds == StateExact)
738  Needs = StateExact;
739 
740  if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
741  MI.getOperand(3).setImm(1);
742 
743  ++Next;
744  } else {
745  // End of basic block
746  if (BI.OutNeeds & StateWQM)
747  Needs = StateWQM;
748  else if (BI.OutNeeds == StateExact)
749  Needs = StateExact;
750  else
751  Needs = StateWQM | StateExact;
752  }
753 
754  // Now, transition if necessary.
755  if (!(Needs & State)) {
757  if (State == StateWWM || Needs == StateWWM) {
758  // We must switch to or from WWM
759  First = FirstWWM;
760  } else {
761  // We only need to switch to/from WQM, so we can use FirstWQM
762  First = FirstWQM;
763  }
764 
766  prepareInsertion(MBB, First, II, Needs == StateWQM,
767  Needs == StateExact || WQMFromExec);
768 
769  if (State == StateWWM) {
770  assert(SavedNonWWMReg);
771  fromWWM(MBB, Before, SavedNonWWMReg);
772  State = NonWWMState;
773  }
774 
775  if (Needs == StateWWM) {
776  NonWWMState = State;
777  SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
778  toWWM(MBB, Before, SavedNonWWMReg);
779  State = StateWWM;
780  } else {
781  if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
782  if (!WQMFromExec && (OutNeeds & StateWQM))
783  SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
784 
785  toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
786  State = StateExact;
787  } else if (State == StateExact && (Needs & StateWQM) &&
788  !(Needs & StateExact)) {
789  assert(WQMFromExec == (SavedWQMReg == 0));
790 
791  toWQM(MBB, Before, SavedWQMReg);
792 
793  if (SavedWQMReg) {
794  LIS->createAndComputeVirtRegInterval(SavedWQMReg);
795  SavedWQMReg = 0;
796  }
797  State = StateWQM;
798  } else {
799  // We can get here if we transitioned from WWM to a non-WWM state that
800  // already matches our needs, but we shouldn't need to do anything.
801  assert(Needs & State);
802  }
803  }
804  }
805 
806  if (Needs != (StateExact | StateWQM | StateWWM)) {
807  if (Needs != (StateExact | StateWQM))
808  FirstWQM = IE;
809  FirstWWM = IE;
810  }
811 
812  if (II == IE)
813  break;
814  II = Next;
815  }
816 }
817 
818 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
819  for (MachineInstr *MI : LiveMaskQueries) {
820  const DebugLoc &DL = MI->getDebugLoc();
821  unsigned Dest = MI->getOperand(0).getReg();
822  MachineInstr *Copy =
823  BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
824  .addReg(LiveMaskReg);
825 
826  LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
827  MI->eraseFromParent();
828  }
829 }
830 
831 void SIWholeQuadMode::lowerCopyInstrs() {
832  for (MachineInstr *MI : LowerToCopyInstrs) {
833  for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
834  MI->RemoveOperand(i);
835  MI->setDesc(TII->get(AMDGPU::COPY));
836  }
837 }
838 
839 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
840  Instructions.clear();
841  Blocks.clear();
842  LiveMaskQueries.clear();
843  LowerToCopyInstrs.clear();
844  CallingConv = MF.getFunction()->getCallingConv();
845 
846  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
847 
848  TII = ST.getInstrInfo();
849  TRI = &TII->getRegisterInfo();
850  MRI = &MF.getRegInfo();
851  LIS = &getAnalysis<LiveIntervals>();
852 
853  char GlobalFlags = analyzeFunction(MF);
854  unsigned LiveMaskReg = 0;
855  if (!(GlobalFlags & StateWQM)) {
856  lowerLiveMaskQueries(AMDGPU::EXEC);
857  if (!(GlobalFlags & StateWWM))
858  return !LiveMaskQueries.empty();
859  } else {
860  // Store a copy of the original live mask when required
861  MachineBasicBlock &Entry = MF.front();
863 
864  if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
865  LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
866  MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
867  TII->get(AMDGPU::COPY), LiveMaskReg)
868  .addReg(AMDGPU::EXEC);
869  LIS->InsertMachineInstrInMaps(*MI);
870  }
871 
872  lowerLiveMaskQueries(LiveMaskReg);
873 
874  if (GlobalFlags == StateWQM) {
875  // For a shader that needs only WQM, we can just set it once.
876  BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
877  AMDGPU::EXEC)
878  .addReg(AMDGPU::EXEC);
879 
880  lowerCopyInstrs();
881  // EntryMI may become invalid here
882  return true;
883  }
884  }
885 
886  DEBUG(printInfo());
887 
888  lowerCopyInstrs();
889 
890  // Handle the general case
891  for (auto BII : Blocks)
892  processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
893 
894  // Physical registers like SCC aren't tracked by default anyway, so just
895  // removing the ranges we computed is the simplest option for maintaining
896  // the analysis results.
897  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
898 
899  return true;
900 }
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
Definition: LiveInterval.h:78
void push_back(const T &Elt)
Definition: SmallVector.h:212
char & SIWholeQuadModeID
AMDGPU specific subclass of TargetSubtarget.
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
Definition: SlotIndexes.h:242
SlotIndex def
The index of the defining instruction.
Definition: LiveInterval.h:61
SI Whole Quad Mode
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:449
iterator_range< mop_iterator > uses()
Returns a range that includes all operands that are register uses.
Definition: MachineInstr.h:365
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_END(SIWholeQuadMode
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:268
unsigned getReg() const
getReg - Returns the register number.
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
const SIInstrInfo * getInstrInfo() const override
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:262
bool isTransient() const
Return true if this is a transient instruction that is either very likely to be eliminated during reg...
Definition: MachineInstr.h:900
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:162
A debug info location.
Definition: DebugLoc.h:34
VNInfo - Value Number Information.
Definition: LiveInterval.h:53
iterator_range< succ_iterator > successors()
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
SlotIndex getNextIndex() const
Returns the next index.
Definition: SlotIndexes.h:280
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
void eraseFromParent()
Unlink &#39;this&#39; from the containing basic block and delete it.
Reg
All possible values of the reg field in the ModR/M byte.
bool isTerminator(QueryType Type=AnyInBundle) const
Returns true if this instruction part of the terminator for a basic block.
Definition: MachineInstr.h:474
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:290
void RemoveOperand(unsigned i)
Erase an operand from an instruction, leaving it with one fewer operand than it started with...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
reverse_iterator rbegin()
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
Definition: LiveInterval.h:529
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:121
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:639
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
unsigned const MachineRegisterInfo * MRI
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
FunctionPass * createSIWholeQuadModePass()
Represent the analysis usage information of a pass.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
Definition: MachineInstr.h:354
void setImm(int64_t immVal)
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
iterator_range< pred_iterator > predecessors()
const MachineBasicBlock & front() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
void setDesc(const MCInstrDesc &tid)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one...
const SIRegisterInfo * getRegisterInfo() const override
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:194
MachineOperand class - Representation of each machine instruction operand.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:864
MachineInstrBuilder MachineInstrBuilder & DefMI
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:285
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:195
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
#define DEBUG_TYPE
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:139
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:59
Interface definition for SIInstrInfo.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
Definition: APInt.h:2018
bool isReg() const
isReg - Tests if this is a MO_Register operand.
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
Definition: LiveInterval.h:396
const Function * getFunction() const
getFunction - Return the LLVM function that this machine code represents
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:73
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:44
#define DEBUG(X)
Definition: Debug.h:118
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:295
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:84