LLVM  10.0.0svn
SIWholeQuadMode.cpp
Go to the documentation of this file.
1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode for pixel
11 /// shaders, and whole wavefront mode for all programs.
12 ///
13 /// Whole quad mode is required for derivative computations, but it interferes
14 /// with shader side effects (stores and atomics). This pass is run on the
15 /// scheduled machine IR but before register coalescing, so that machine SSA is
16 /// available for analysis. It ensures that WQM is enabled when necessary, but
17 /// disabled around stores and atomics.
18 ///
19 /// When necessary, this pass creates a function prolog
20 ///
21 /// S_MOV_B64 LiveMask, EXEC
22 /// S_WQM_B64 EXEC, EXEC
23 ///
24 /// to enter WQM at the top of the function and surrounds blocks of Exact
25 /// instructions by
26 ///
27 /// S_AND_SAVEEXEC_B64 Tmp, LiveMask
28 /// ...
29 /// S_MOV_B64 EXEC, Tmp
30 ///
31 /// We also compute when a sequence of instructions requires Whole Wavefront
32 /// Mode (WWM) and insert instructions to save and restore it:
33 ///
34 /// S_OR_SAVEEXEC_B64 Tmp, -1
35 /// ...
36 /// S_MOV_B64 EXEC, Tmp
37 ///
38 /// In order to avoid excessive switching during sequences of Exact
39 /// instructions, the pass first analyzes which instructions must be run in WQM
40 /// (aka which instructions produce values that lead to derivative
41 /// computations).
42 ///
43 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
44 ///
45 /// There is room for improvement given better control flow analysis:
46 ///
47 /// (1) at the top level (outside of control flow statements, and as long as
48 /// kill hasn't been used), one SGPR can be saved by recovering WQM from
49 /// the LiveMask (this is implemented for the entry block).
50 ///
51 /// (2) when entire regions (e.g. if-else blocks or entire loops) only
52 /// consist of exact and don't-care instructions, the switch only has to
53 /// be done at the entry and exit points rather than potentially in each
54 /// block of the region.
55 ///
56 //===----------------------------------------------------------------------===//
57 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>
86 
87 using namespace llvm;
88 
89 #define DEBUG_TYPE "si-wqm"
90 
91 namespace {
92 
// Execution-mode states tracked per instruction and per block. These are
// bits so an instruction's allowed/required modes can be a mask.
enum {
  StateWQM = 0x1,   // whole quad mode: helper lanes of partial quads enabled
  StateWWM = 0x2,   // whole wavefront mode: all lanes enabled
  StateExact = 0x4, // exact mode: only the originally live lanes enabled
};
98 
// Thin wrapper so a state bitmask prints symbolically (e.g. "WQM|Exact")
// via the operator<< overload below, used only for debug output.
struct PrintState {
public:
  int State; // bitmask of StateWQM/StateWWM/StateExact

  explicit PrintState(int State) : State(State) {}
};
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108  if (PS.State & StateWQM)
109  OS << "WQM";
110  if (PS.State & StateWWM) {
111  if (PS.State & StateWQM)
112  OS << '|';
113  OS << "WWM";
114  }
115  if (PS.State & StateExact) {
116  if (PS.State & (StateWQM | StateWWM))
117  OS << '|';
118  OS << "Exact";
119  }
120 
121  return OS;
122 }
123 #endif
124 
// Per-instruction analysis result.
struct InstrInfo {
  char Needs = 0;    // modes this instruction must run in
  char Disabled = 0; // modes this instruction must NOT run in
  char OutNeeds = 0; // modes required by any later instruction it feeds
};
130 
// Per-basic-block summary of the instructions it contains.
struct BlockInfo {
  char Needs = 0;    // modes required somewhere in the block
  char InNeeds = 0;  // modes that must hold on entry to the block
  char OutNeeds = 0; // modes that must hold on exit from the block
};
136 
// A worklist entry for the propagation fixpoint: exactly one of MBB or MI
// is non-null, selecting block-level or instruction-level propagation.
struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};
145 
146 class SIWholeQuadMode : public MachineFunctionPass {
147 private:
149  const SIInstrInfo *TII;
150  const SIRegisterInfo *TRI;
151  const GCNSubtarget *ST;
153  LiveIntervals *LIS;
154 
157  SmallVector<MachineInstr *, 1> LiveMaskQueries;
158  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
159 
160  void printInfo();
161 
162  void markInstruction(MachineInstr &MI, char Flag,
163  std::vector<WorkItem> &Worklist);
164  void markInstructionUses(const MachineInstr &MI, char Flag,
165  std::vector<WorkItem> &Worklist);
166  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
167  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
168  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
169  char analyzeFunction(MachineFunction &MF);
170 
171  bool requiresCorrectState(const MachineInstr &MI) const;
172 
176  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
177  MachineBasicBlock::iterator Last, bool PreferLast,
178  bool SaveSCC);
179  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
180  unsigned SaveWQM, unsigned LiveMaskReg);
181  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
182  unsigned SavedWQM);
183  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
184  unsigned SaveOrig);
185  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
186  unsigned SavedOrig);
187  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
188 
189  void lowerLiveMaskQueries(unsigned LiveMaskReg);
190  void lowerCopyInstrs();
191 
192 public:
193  static char ID;
194 
195  SIWholeQuadMode() :
196  MachineFunctionPass(ID) { }
197 
198  bool runOnMachineFunction(MachineFunction &MF) override;
199 
200  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
201 
202  void getAnalysisUsage(AnalysisUsage &AU) const override {
206  AU.setPreservesCFG();
208  }
209 };
210 
211 } // end anonymous namespace
212 
213 char SIWholeQuadMode::ID = 0;
214 
215 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
216  false)
218 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
219  false)
220 
221 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
222 
224  return new SIWholeQuadMode;
225 }
226 
227 #ifndef NDEBUG
228 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
229  for (const auto &BII : Blocks) {
230  dbgs() << "\n"
231  << printMBBReference(*BII.first) << ":\n"
232  << " InNeeds = " << PrintState(BII.second.InNeeds)
233  << ", Needs = " << PrintState(BII.second.Needs)
234  << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
235 
236  for (const MachineInstr &MI : *BII.first) {
237  auto III = Instructions.find(&MI);
238  if (III == Instructions.end())
239  continue;
240 
241  dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
242  << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
243  }
244  }
245 }
246 #endif
247 
248 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
249  std::vector<WorkItem> &Worklist) {
250  InstrInfo &II = Instructions[&MI];
251 
252  assert(!(Flag & StateExact) && Flag != 0);
253 
254  // Remove any disabled states from the flag. The user that required it gets
255  // an undefined value in the helper lanes. For example, this can happen if
256  // the result of an atomic is used by instruction that requires WQM, where
257  // ignoring the request for WQM is correct as per the relevant specs.
258  Flag &= ~II.Disabled;
259 
260  // Ignore if the flag is already encompassed by the existing needs, or we
261  // just disabled everything.
262  if ((II.Needs & Flag) == Flag)
263  return;
264 
265  II.Needs |= Flag;
266  Worklist.push_back(&MI);
267 }
268 
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    Register Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!Register::isVirtualRegister(Reg)) {
      // EXEC is manipulated by this very pass; do not chase its defs.
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      // Walk each register unit's live range to find the reaching def at MI.
      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    // Virtual register: mark every defining instruction (SSA, so normally one,
    // but subregister defs can contribute several).
    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}
307 
308 // Scan instructions to determine which ones require an Exact execmask and
309 // which ones seed WQM requirements.
310 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
311  std::vector<WorkItem> &Worklist) {
312  char GlobalFlags = 0;
313  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
314  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
315  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
316 
317  // We need to visit the basic blocks in reverse post-order so that we visit
318  // defs before uses, in particular so that we don't accidentally mark an
319  // instruction as needing e.g. WQM before visiting it and realizing it needs
320  // WQM disabled.
322  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
323  MachineBasicBlock &MBB = **BI;
324  BlockInfo &BBI = Blocks[&MBB];
325 
326  for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
327  MachineInstr &MI = *II;
328  InstrInfo &III = Instructions[&MI];
329  unsigned Opcode = MI.getOpcode();
330  char Flags = 0;
331 
332  if (TII->isWQM(Opcode)) {
333  // Sampling instructions don't need to produce results for all pixels
334  // in a quad, they just require all inputs of a quad to have been
335  // computed for derivatives.
336  markInstructionUses(MI, StateWQM, Worklist);
337  GlobalFlags |= StateWQM;
338  continue;
339  } else if (Opcode == AMDGPU::WQM) {
340  // The WQM intrinsic requires its output to have all the helper lanes
341  // correct, so we need it to be in WQM.
342  Flags = StateWQM;
343  LowerToCopyInstrs.push_back(&MI);
344  } else if (Opcode == AMDGPU::SOFT_WQM) {
345  LowerToCopyInstrs.push_back(&MI);
346  SoftWQMInstrs.push_back(&MI);
347  continue;
348  } else if (Opcode == AMDGPU::WWM) {
349  // The WWM intrinsic doesn't make the same guarantee, and plus it needs
350  // to be executed in WQM or Exact so that its copy doesn't clobber
351  // inactive lanes.
352  markInstructionUses(MI, StateWWM, Worklist);
353  GlobalFlags |= StateWWM;
354  LowerToCopyInstrs.push_back(&MI);
355  continue;
356  } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
357  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
358  III.Disabled = StateWWM;
359  MachineOperand &Inactive = MI.getOperand(2);
360  if (Inactive.isReg()) {
361  if (Inactive.isUndef()) {
362  LowerToCopyInstrs.push_back(&MI);
363  } else {
364  Register Reg = Inactive.getReg();
365  if (Register::isVirtualRegister(Reg)) {
366  for (MachineInstr &DefMI : MRI->def_instructions(Reg))
367  markInstruction(DefMI, StateWWM, Worklist);
368  }
369  }
370  }
371  SetInactiveInstrs.push_back(&MI);
372  continue;
373  } else if (TII->isDisableWQM(MI)) {
374  BBI.Needs |= StateExact;
375  if (!(BBI.InNeeds & StateExact)) {
376  BBI.InNeeds |= StateExact;
377  Worklist.push_back(&MBB);
378  }
379  GlobalFlags |= StateExact;
380  III.Disabled = StateWQM | StateWWM;
381  continue;
382  } else {
383  if (Opcode == AMDGPU::SI_PS_LIVE) {
384  LiveMaskQueries.push_back(&MI);
385  } else if (WQMOutputs) {
386  // The function is in machine SSA form, which means that physical
387  // VGPRs correspond to shader inputs and outputs. Inputs are
388  // only used, outputs are only defined.
389  for (const MachineOperand &MO : MI.defs()) {
390  if (!MO.isReg())
391  continue;
392 
393  Register Reg = MO.getReg();
394 
395  if (!Register::isVirtualRegister(Reg) &&
396  TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
397  Flags = StateWQM;
398  break;
399  }
400  }
401  }
402 
403  if (!Flags)
404  continue;
405  }
406 
407  markInstruction(MI, Flags, Worklist);
408  GlobalFlags |= Flags;
409  }
410  }
411 
412  // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
413  // ever used anywhere in the function. This implements the corresponding
414  // semantics of @llvm.amdgcn.set.inactive.
415  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
416  if (GlobalFlags & StateWQM) {
417  for (MachineInstr *MI : SetInactiveInstrs)
418  markInstruction(*MI, StateWQM, Worklist);
419  for (MachineInstr *MI : SoftWQMInstrs)
420  markInstruction(*MI, StateWQM, Worklist);
421  }
422 
423  return GlobalFlags;
424 }
425 
// Propagate one instruction's needs: upward to its block summary, backward to
// the previous instruction in the block, and to the instructions defining its
// inputs.
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    // Update both the map entry and the local copy, since II is a snapshot.
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block. WWM is deliberately excluded: it is
  // entered only around the instructions that need it, not up the chain.
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}
472 
473 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
474  std::vector<WorkItem>& Worklist) {
475  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
476 
477  // Propagate through instructions
478  if (!MBB.empty()) {
479  MachineInstr *LastMI = &*MBB.rbegin();
480  InstrInfo &LastII = Instructions[LastMI];
481  if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
482  LastII.OutNeeds |= BI.OutNeeds;
483  Worklist.push_back(LastMI);
484  }
485  }
486 
487  // Predecessor blocks must provide for our WQM/Exact needs.
488  for (MachineBasicBlock *Pred : MBB.predecessors()) {
489  BlockInfo &PredBI = Blocks[Pred];
490  if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
491  continue;
492 
493  PredBI.OutNeeds |= BI.InNeeds;
494  PredBI.InNeeds |= BI.InNeeds;
495  Worklist.push_back(Pred);
496  }
497 
498  // All successors must be prepared to accept the same set of WQM/Exact data.
499  for (MachineBasicBlock *Succ : MBB.successors()) {
500  BlockInfo &SuccBI = Blocks[Succ];
501  if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
502  continue;
503 
504  SuccBI.InNeeds |= BI.OutNeeds;
505  Worklist.push_back(Succ);
506  }
507 }
508 
509 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
510  std::vector<WorkItem> Worklist;
511  char GlobalFlags = scanInstructions(MF, Worklist);
512 
513  while (!Worklist.empty()) {
514  WorkItem WI = Worklist.back();
515  Worklist.pop_back();
516 
517  if (WI.MI)
518  propagateInstruction(*WI.MI, Worklist);
519  else
520  propagateBlock(*WI.MBB, Worklist);
521  }
522 
523  return GlobalFlags;
524 }
525 
526 /// Whether \p MI really requires the exec state computed during analysis.
527 ///
528 /// Scalar instructions must occasionally be marked WQM for correct propagation
529 /// (e.g. thread masks leading up to branches), but when it comes to actual
530 /// execution, they don't care about EXEC.
531 bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
532  if (MI.isTerminator())
533  return true;
534 
535  // Skip instructions that are not affected by EXEC
536  if (TII->isScalarUnit(MI))
537  return false;
538 
539  // Generic instructions such as COPY will either disappear by register
540  // coalescing or be lowered to SALU or VALU instructions.
541  if (MI.isTransient()) {
542  if (MI.getNumExplicitOperands() >= 1) {
543  const MachineOperand &Op = MI.getOperand(0);
544  if (Op.isReg()) {
545  if (TRI->isSGPRReg(*MRI, Op.getReg())) {
546  // SGPR instructions are not affected by EXEC
547  return false;
548  }
549  }
550  }
551  }
552 
553  return true;
554 }
555 
557 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
559  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
560 
561  MachineInstr *Save =
562  BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
563  .addReg(AMDGPU::SCC);
564  MachineInstr *Restore =
565  BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
566  .addReg(SaveReg);
567 
568  LIS->InsertMachineInstrInMaps(*Save);
569  LIS->InsertMachineInstrInMaps(*Restore);
570  LIS->createAndComputeVirtRegInterval(SaveReg);
571 
572  return Restore;
573 }
574 
575 // Return an iterator in the (inclusive) range [First, Last] at which
576 // instructions can be safely inserted, keeping in mind that some of the
577 // instructions we want to add necessarily clobber SCC.
578 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
580  MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
581  if (!SaveSCC)
582  return PreferLast ? Last : First;
583 
584  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
585  auto MBBE = MBB.end();
586  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
587  : LIS->getMBBEndIdx(&MBB);
588  SlotIndex LastIdx =
589  Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
590  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
591  const LiveRange::Segment *S;
592 
593  for (;;) {
594  S = LR.getSegmentContaining(Idx);
595  if (!S)
596  break;
597 
598  if (PreferLast) {
599  SlotIndex Next = S->start.getBaseIndex();
600  if (Next < FirstIdx)
601  break;
602  Idx = Next;
603  } else {
604  SlotIndex Next = S->end.getNextIndex().getBaseIndex();
605  if (Next > LastIdx)
606  break;
607  Idx = Next;
608  }
609  }
610 
612 
613  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
614  MBBI = MI;
615  else {
616  assert(Idx == LIS->getMBBEndIdx(&MBB));
617  MBBI = MBB.end();
618  }
619 
620  if (S)
621  MBBI = saveSCC(MBB, MBBI);
622 
623  return MBBI;
624 }
625 
626 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
628  unsigned SaveWQM, unsigned LiveMaskReg) {
629  MachineInstr *MI;
630 
631  if (SaveWQM) {
632  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
633  AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
634  SaveWQM)
635  .addReg(LiveMaskReg);
636  } else {
637  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
638  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
639  AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
640  Exec)
641  .addReg(Exec)
642  .addReg(LiveMaskReg);
643  }
644 
645  LIS->InsertMachineInstrInMaps(*MI);
646 }
647 
648 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
650  unsigned SavedWQM) {
651  MachineInstr *MI;
652 
653  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
654  if (SavedWQM) {
655  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
656  .addReg(SavedWQM);
657  } else {
658  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
659  AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
660  Exec)
661  .addReg(Exec);
662  }
663 
664  LIS->InsertMachineInstrInMaps(*MI);
665 }
666 
667 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
669  unsigned SaveOrig) {
670  MachineInstr *MI;
671 
672  assert(SaveOrig);
673  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
674  .addImm(-1);
675  LIS->InsertMachineInstrInMaps(*MI);
676 }
677 
678 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
680  unsigned SavedOrig) {
681  MachineInstr *MI;
682 
683  assert(SavedOrig);
684  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
685  ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
686  .addReg(SavedOrig);
687  LIS->InsertMachineInstrInMaps(*MI);
688 }
689 
690 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
691  bool isEntry) {
692  auto BII = Blocks.find(&MBB);
693  if (BII == Blocks.end())
694  return;
695 
696  const BlockInfo &BI = BII->second;
697 
698  // This is a non-entry block that is WQM throughout, so no need to do
699  // anything.
700  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
701  return;
702 
703  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
704  << ":\n");
705 
706  unsigned SavedWQMReg = 0;
707  unsigned SavedNonWWMReg = 0;
708  bool WQMFromExec = isEntry;
709  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
710  char NonWWMState = 0;
711  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
712 
713  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
714  if (isEntry)
715  ++II; // Skip the instruction that saves LiveMask
716 
717  // This stores the first instruction where it's safe to switch from WQM to
718  // Exact or vice versa.
719  MachineBasicBlock::iterator FirstWQM = IE;
720 
721  // This stores the first instruction where it's safe to switch from WWM to
722  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
723  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
724  // switch to/from WQM as well.
725  MachineBasicBlock::iterator FirstWWM = IE;
726  for (;;) {
727  MachineBasicBlock::iterator Next = II;
728  char Needs = StateExact | StateWQM; // WWM is disabled by default
729  char OutNeeds = 0;
730 
731  if (FirstWQM == IE)
732  FirstWQM = II;
733 
734  if (FirstWWM == IE)
735  FirstWWM = II;
736 
737  // First, figure out the allowed states (Needs) based on the propagated
738  // flags.
739  if (II != IE) {
740  MachineInstr &MI = *II;
741 
742  if (requiresCorrectState(MI)) {
743  auto III = Instructions.find(&MI);
744  if (III != Instructions.end()) {
745  if (III->second.Needs & StateWWM)
746  Needs = StateWWM;
747  else if (III->second.Needs & StateWQM)
748  Needs = StateWQM;
749  else
750  Needs &= ~III->second.Disabled;
751  OutNeeds = III->second.OutNeeds;
752  }
753  } else {
754  // If the instruction doesn't actually need a correct EXEC, then we can
755  // safely leave WWM enabled.
756  Needs = StateExact | StateWQM | StateWWM;
757  }
758 
759  if (MI.isTerminator() && OutNeeds == StateExact)
760  Needs = StateExact;
761 
762  if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
763  MI.getOperand(3).setImm(1);
764 
765  ++Next;
766  } else {
767  // End of basic block
768  if (BI.OutNeeds & StateWQM)
769  Needs = StateWQM;
770  else if (BI.OutNeeds == StateExact)
771  Needs = StateExact;
772  else
773  Needs = StateWQM | StateExact;
774  }
775 
776  // Now, transition if necessary.
777  if (!(Needs & State)) {
779  if (State == StateWWM || Needs == StateWWM) {
780  // We must switch to or from WWM
781  First = FirstWWM;
782  } else {
783  // We only need to switch to/from WQM, so we can use FirstWQM
784  First = FirstWQM;
785  }
786 
788  prepareInsertion(MBB, First, II, Needs == StateWQM,
789  Needs == StateExact || WQMFromExec);
790 
791  if (State == StateWWM) {
792  assert(SavedNonWWMReg);
793  fromWWM(MBB, Before, SavedNonWWMReg);
794  State = NonWWMState;
795  }
796 
797  if (Needs == StateWWM) {
798  NonWWMState = State;
799  SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
800  toWWM(MBB, Before, SavedNonWWMReg);
801  State = StateWWM;
802  } else {
803  if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
804  if (!WQMFromExec && (OutNeeds & StateWQM))
805  SavedWQMReg = MRI->createVirtualRegister(BoolRC);
806 
807  toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
808  State = StateExact;
809  } else if (State == StateExact && (Needs & StateWQM) &&
810  !(Needs & StateExact)) {
811  assert(WQMFromExec == (SavedWQMReg == 0));
812 
813  toWQM(MBB, Before, SavedWQMReg);
814 
815  if (SavedWQMReg) {
816  LIS->createAndComputeVirtRegInterval(SavedWQMReg);
817  SavedWQMReg = 0;
818  }
819  State = StateWQM;
820  } else {
821  // We can get here if we transitioned from WWM to a non-WWM state that
822  // already matches our needs, but we shouldn't need to do anything.
823  assert(Needs & State);
824  }
825  }
826  }
827 
828  if (Needs != (StateExact | StateWQM | StateWWM)) {
829  if (Needs != (StateExact | StateWQM))
830  FirstWQM = IE;
831  FirstWWM = IE;
832  }
833 
834  if (II == IE)
835  break;
836  II = Next;
837  }
838 }
839 
840 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
841  for (MachineInstr *MI : LiveMaskQueries) {
842  const DebugLoc &DL = MI->getDebugLoc();
843  Register Dest = MI->getOperand(0).getReg();
844  MachineInstr *Copy =
845  BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
846  .addReg(LiveMaskReg);
847 
848  LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
849  MI->eraseFromParent();
850  }
851 }
852 
853 void SIWholeQuadMode::lowerCopyInstrs() {
854  for (MachineInstr *MI : LowerToCopyInstrs) {
855  for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
856  MI->RemoveOperand(i);
857 
858  const Register Reg = MI->getOperand(0).getReg();
859 
860  if (TRI->isVGPR(*MRI, Reg)) {
861  const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
862  ? MRI->getRegClass(Reg)
863  : TRI->getPhysRegClass(Reg);
864 
865  const unsigned MovOp = TII->getMovOpcode(regClass);
866  MI->setDesc(TII->get(MovOp));
867 
868  // And make it implicitly depend on exec (like all VALU movs should do).
869  MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
870  } else {
871  MI->setDesc(TII->get(AMDGPU::COPY));
872  }
873  }
874 }
875 
876 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
877  Instructions.clear();
878  Blocks.clear();
879  LiveMaskQueries.clear();
880  LowerToCopyInstrs.clear();
882 
883  ST = &MF.getSubtarget<GCNSubtarget>();
884 
885  TII = ST->getInstrInfo();
886  TRI = &TII->getRegisterInfo();
887  MRI = &MF.getRegInfo();
888  LIS = &getAnalysis<LiveIntervals>();
889 
890  char GlobalFlags = analyzeFunction(MF);
891  unsigned LiveMaskReg = 0;
892  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
893  if (!(GlobalFlags & StateWQM)) {
894  lowerLiveMaskQueries(Exec);
895  if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
896  return !LiveMaskQueries.empty();
897  } else {
898  // Store a copy of the original live mask when required
899  MachineBasicBlock &Entry = MF.front();
901 
902  if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
903  LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
904  MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
905  TII->get(AMDGPU::COPY), LiveMaskReg)
906  .addReg(Exec);
907  LIS->InsertMachineInstrInMaps(*MI);
908  }
909 
910  lowerLiveMaskQueries(LiveMaskReg);
911 
912  if (GlobalFlags == StateWQM) {
913  // For a shader that needs only WQM, we can just set it once.
914  BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
915  AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
916  Exec)
917  .addReg(Exec);
918 
919  lowerCopyInstrs();
920  // EntryMI may become invalid here
921  return true;
922  }
923  }
924 
925  LLVM_DEBUG(printInfo());
926 
927  lowerCopyInstrs();
928 
929  // Handle the general case
930  for (auto BII : Blocks)
931  processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
932 
933  // Physical registers like SCC aren't tracked by default anyway, so just
934  // removing the ranges we computed is the simplest option for maintaining
935  // the analysis results.
936  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
937 
938  return true;
939 }
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
Definition: LiveInterval.h:77
char & SIWholeQuadModeID
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
AMDGPU specific subclass of TargetSubtarget.
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
Definition: SlotIndexes.h:241
SlotIndex def
The index of the defining instruction.
Definition: LiveInterval.h:60
SI Whole Quad Mode
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:484
iterator_range< mop_iterator > uses()
Returns a range that includes all operands that are register uses.
Definition: MachineInstr.h:509
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_END(SIWholeQuadMode
void push_back(const T &Elt)
Definition: SmallVector.h:211
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:384
unsigned Reg
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:323
bool isTransient() const
Return true if this is a transient instruction that is either very likely to be eliminated during reg...
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:161
unsigned const TargetRegisterInfo * TRI
A debug info location.
Definition: DebugLoc.h:33
VNInfo - Value Number Information.
Definition: LiveInterval.h:52
iterator_range< succ_iterator > successors()
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:156
static const AMDGPUSubtarget & get(const MachineFunction &MF)
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
SlotIndex getNextIndex() const
Returns the next index.
Definition: SlotIndexes.h:279
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:55
void eraseFromParent()
Unlink &#39;this&#39; from the containing basic block and delete it.
bool isTerminator(QueryType Type=AnyInBundle) const
Returns true if this instruction part of the terminator for a basic block.
Definition: MachineInstr.h:672
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:410
SlotIndexes pass.
Definition: SlotIndexes.h:314
reverse_iterator rbegin()
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
Definition: LiveInterval.h:532
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:131
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:843
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
unsigned const MachineRegisterInfo * MRI
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
FunctionPass * createSIWholeQuadModePass()
Represent the analysis usage information of a pass.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
Definition: MachineInstr.h:498
void setImm(int64_t immVal)
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
iterator_range< pred_iterator > predecessors()
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
const MachineBasicBlock & front() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
void setDesc(const MCInstrDesc &tid)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one...
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:212
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
MachineInstrBuilder MachineInstrBuilder & DefMI
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:301
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
#define DEBUG_TYPE
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:255
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:63
Interface definition for SIInstrInfo.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
Definition: APInt.h:2047
bool isReg() const
isReg - Tests if this is a MO_Register operand.
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
Definition: LiveInterval.h:399
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:74
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:69
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
void RemoveOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with...
Register getReg() const
getReg - Returns the register number.
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:415
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:83
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
const SIRegisterInfo * getRegisterInfo() const override