//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
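/// As an illustrative sketch (hypothetical pixel shader, not taken from an
/// actual test case), a block containing
///
///   %val = IMAGE_SAMPLE ...      ; needs WQM to compute derivatives
///   BUFFER_STORE_DWORD %val ...  ; must run Exact to avoid phantom stores
///
/// ends up with EXEC switched into whole quad mode before the sample and
/// restricted back to the live mask around the store, using the instruction
/// sequences shown above.
///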
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;    // States this instruction must execute in.
  char Disabled = 0; // States this instruction must not execute in.
  char OutNeeds = 0; // States required after this instruction.
};

struct BlockInfo {
  char Needs = 0;    // States needed somewhere within this block.
  char InNeeds = 0;  // States required on entry to this block.
  char OutNeeds = 0; // States required on exit from this block.
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TRI->isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and it also needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            unsigned Reg = Inactive.getReg();
            if (TRI->isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

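// Save SCC across an insertion point: copy it into a fresh SGPR before
// \p Before and copy it back immediately afterwards. The returned iterator
// points at the restoring copy, so instructions inserted there land between
// the save and the restore, where they may clobber SCC freely.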
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
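//
// The search walks SCC's live segments outward from the preferred end of the
// range: while the candidate index lies inside a segment where SCC is live,
// step past that segment and retry, giving up once the candidate would leave
// [First, Last]. If no SCC-free point exists in the range, SCC is explicitly
// saved and restored at the chosen point instead.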
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state that
          // already matches our needs, but we shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

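    // If this instruction restricted the set of allowed states, later state
    // switches cannot be hoisted above it: a pinned-down state invalidates
    // both earliest-insertion points, while merely disallowing WWM only
    // invalidates the WWM one.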
    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
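    // Strip all explicit operands beyond the destination and the first
    // source, so the instruction has the two-operand shape of a copy before
    // its opcode is rewritten below.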
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);

    const unsigned Reg = MI->getOperand(0).getReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          TRI->isVirtualRegister(Reg)
              ? MRI->getRegClass(Reg)
              : TRI->getPhysRegClass(Reg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else {
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
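  // Shaders that never need WQM can lower their live mask queries straight to
  // EXEC; if they do not use WWM either, we are done. Shaders that are WQM
  // throughout only need WQM enabled once in the entry block. Everything else
  // falls through to the general per-block processing below.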
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    if (!(GlobalFlags & StateWWM))
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}