//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
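/// As an illustrative sketch (not taken from a real shader), in a sequence
/// like
///
///   %v = call <4 x float> @llvm.amdgcn.image.sample... ; derivatives: WQM
///   call void @llvm.amdgcn.buffer.store...(%v, ...)    ; side effect: Exact
///
/// the sample must run in whole quad mode so that helper lanes feed the
/// implicit derivative computation, while the store must run with the
/// original EXEC mask so that helper lanes do not write to memory.
///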
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

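// Execution-state flags used throughout the pass. These are bitmask values:
// a combination such as StateWQM | StateExact means that either state is
// acceptable at a given point.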
enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;    // States in which this instruction must execute.
  char Disabled = 0; // States that must not be enabled for this instruction.
  char OutNeeds = 0; // States required immediately after this instruction.
};

struct BlockInfo {
  char Needs = 0;    // States required somewhere within this block.
  char InNeeds = 0;  // States required on entry to this block.
  char OutNeeds = 0; // States required on exit from this block.
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and it also needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            unsigned Reg = Inactive.getReg();
            if (TargetRegisterInfo::isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

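// Revisit a marked instruction: promote terminators and VMEM stores whose
// output feeds WQM computations to WQM themselves, push the requirement up to
// the containing block, and propagate it backwards to the previous
// instruction and to the definitions of all operands.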
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

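// Revisit a block: push its out-needs onto its last instruction, require
// predecessors to provide the block's in-needs, and make successors accept
// the block's out-needs.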
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

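// Seed the worklist via scanInstructions, then run the backwards dataflow to
// a fixed point.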
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                   AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                   AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
                 Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
                   AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
                 Exec)
             .addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

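// Walk a block forward, tracking the current EXEC state (Exact, WQM or WWM)
// and inserting the minimal transitions computed by the analysis above at
// positions chosen by prepareInsertion.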
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we
        // can safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state
          // that already matches our needs, in which case there is nothing
          // to do.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

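// Replace the SI_PS_LIVE live-mask queries collected during scanning with a
// plain copy of the given live mask register.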
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

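// Lower the WQM/WWM/V_SET_INACTIVE pseudos recorded earlier into plain moves
// (a VALU mov for VGPR destinations, a COPY otherwise), dropping any extra
// operands.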
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);

    const unsigned Reg = MI->getOperand(0).getReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          TargetRegisterInfo::isVirtualRegister(Reg)
              ? MRI->getRegClass(Reg)
              : TRI->getPhysRegClass(Reg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else {
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(Exec);
    if (!(GlobalFlags & StateWWM))
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
                AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
              Exec)
          .addReg(Exec);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}