//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow; specifically, an inactive lane enabled by strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
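///
/// As an illustrative sketch (not lifted from any real shader): a pixel
/// shader that samples a texture and then stores the result would be
/// rewritten roughly as
///
///   S_MOV_B64 LiveMask, EXEC           ; prolog: remember truly live lanes
///   S_WQM_B64 EXEC, EXEC               ; enter WQM for the sampling
///   IMAGE_SAMPLE ...
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask   ; drop helper lanes before the store
///   BUFFER_STORE_DWORD ...             ; Exact-only side effect
///   S_MOV_B64 EXEC, Tmp                ; re-enter WQM afterwards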
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};
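
// Note: the state values are bit flags so they can be combined into sets;
// for example, processBlock() below uses (StateExact | StateWQM) to mean
// "any non-strict state is acceptable" for an instruction.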

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {

  static const std::pair<char, const char *> Mapping[] = {
      std::make_pair(StateWQM, "WQM"),
      std::make_pair(StateStrictWWM, "StrictWWM"),
      std::make_pair(StateStrictWQM, "StrictWQM"),
      std::make_pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;

      if (State)
        OS << '|';
    }
  }
  assert(State == 0);
  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;    // Bitmask of states this instruction must execute in.
  char Disabled = 0; // Bitmask of states this instruction must never use.
  char OutNeeds = 0; // Bitmask of states required after this instruction.
};

struct BlockInfo {
  char Needs = 0;    // Bitmask of states required within this block.
  char InNeeds = 0;  // Bitmask of states required on entry to this block.
  char OutNeeds = 0; // Bitmask of states required on exit from this block.
  char InitialState = 0;
  bool NeedsLowering = false; // Block contains kills/demotes to lower.
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  unsigned AndOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned OrSaveExecOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  void lowerLiveMaskQueries();
  void lowerCopyInstrs();
  void lowerKillInstrs(bool IsWQM);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addPreserved<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM,
  // where ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (!Visited.count(Key)) {
      Visited.insert(Key);
      // On first visit to a phi, start processing at the first predecessor.
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, add the phi to the stack.
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->operands()) {
          if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any lanes of the use.
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi.
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}
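
// A hypothetical walk of the loop above: for a virtual register whose value
// merges at a phi with two predecessors, the phi is visited first and a
// PhiEntry is pushed; predecessor 0's def chain is then followed until all
// use lanes are covered, after which the stack entry is popped so that
// predecessor 1's chain can be processed with the original DefinedLanes.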

void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
         ++RegUnit) {
      LiveRange &LR = LIS->getRegUnit(*RegUnit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (!Value)
        continue;

      markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
    }
  }
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;
    markOperand(MI, Use, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        if (!ST->hasExtendedImageInsts())
          continue;
        // Only generate implicit WQM if implicit derivatives are required.
        // This avoids inserting unintended WQM if a shader type without
        // implicit derivatives uses an image sampling instruction.
        if (!HasImplicitDerivatives)
          continue;
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
        // addition it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WQM) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
          LiveMaskQueries.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                   Opcode == AMDGPU::SI_DEMOTE_I1) {
          KillInstrs.push_back(&MI);
          BBI.NeedsLowering = true;
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          // FIXME: is this still valid?
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
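
// For example, a use of @llvm.amdgcn.wqm reaches this scan as the WQM pseudo
// handled above: the instructions feeding it are marked StateWQM here, and
// the pseudo itself is later rewritten into a plain COPY by lowerCopyInstrs().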

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
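
// As a hypothetical illustration of the rule above: if a store to scratch
// (temporary memory, which does not disable WQM) is followed in its block by
// a computation that needs WQM, the store's OutNeeds gains StateWQM and the
// store itself is promoted to StateWQM, so helper lanes also perform the
// store and the later WQM reads see consistent data.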

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}
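
// The worklist iteration above reaches a fixed point because the Needs,
// InNeeds and OutNeeds sets only ever grow: each instruction or block is
// re-queued only when a new state bit is added, which bounds the total work.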

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);

    // Link blocks
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}

MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick the e32 or e64 encoding based on the operand types.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}
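
// Worked example of the inversion above: a kill whose live condition is
// SETOLT(Op0, Op1), i.e. "keep lanes where Op0 < Op1", is emitted as
// V_CMP_NGT_F32(Op1, Op0). Per lane that computes !(Op1 > Op0), which is
// exactly !(Op0 < Op1): the killed lanes, with unordered (NaN) inputs
// correctly counting as killed.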

MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // If we got this far, some lanes are still live and we must
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
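
// Note the asymmetry between the two paths above: a plain kill removes the
// killed lanes from EXEC directly, whereas a demote in WQM recomputes EXEC
// as S_WQM(LiveMask), so whole quads stay enabled as long as any lane in
// the quad is still truly live and only helper-only quads are switched off.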

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  char State = BI.InitialState;

  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);
  }

  // Perform splitting after instruction scan to simplify iteration.
  if (!SplitPoints.empty()) {
    MachineBasicBlock *BB = &MBB;
    for (MachineInstr *MI : SplitPoints) {
      BB = splitBlock(BB, MI);
    }
  }
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
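
// In other words: the loop above slides the candidate index across live
// segments of SCC until it finds a gap inside [First, Last]; if no such gap
// exists, saveSCC() is used as a fallback to copy SCC out and back around
// the insertion point.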

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  // Record the strict state actually entered (StrictWWM or StrictWQM).
  StateTransition[MI] = StrictStateNeeded;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;

  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 Exec)
             .addReg(SavedOrig);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 Exec)
             .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record the initial state in the block information.
  BI.InitialState = State;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs; in that
          // case nothing needs to be done.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}

void SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}
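
// For example, a query such as "%dst = SI_PS_LIVE" simply becomes
// "%dst = COPY %LiveMaskReg" (and LiveMaskReg is just EXEC itself when no
// separate live mask register was required).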

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();
    const unsigned SubReg = MI->getOperand(0).getSubReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
      if (SubReg)
        regClass = TRI->getSubRegClass(regClass, SubReg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // input, so it is being replaced by a simple copy. There should be a
      // second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineBasicBlock *MBB = MI->getParent();
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    default:
      continue;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());

  LiveMaskReg = Exec;

  // The shader is simple and does not need any state changes or any complex
  // lowering.
  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
      LowerToMovInstrs.empty() && KillInstrs.empty()) {
    lowerLiveMaskQueries();
    return !LiveMaskQueries.empty();
  }
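
  // Past this point some EXEC manipulation is needed, so a dedicated live
  // mask register may be allocated below; until then LiveMaskReg simply
  // aliases EXEC.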

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

  // Store a copy of the original live mask when required
  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  LLVM_DEBUG(printInfo());

  lowerLiveMaskQueries();
  lowerCopyInstrs();

  // Shader only needs WQM
  if (GlobalFlags == StateWQM) {
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return true;
}
llvm::LaneBitmask
Definition: LaneBitmask.h:40
llvm::ISD::SETUGE
@ SETUGE
Definition: ISDOpcodes.h:1417
llvm::MachineBasicBlock::succ_size
unsigned succ_size() const
Definition: MachineBasicBlock.h:353
llvm::ISD::SETLE
@ SETLE
Definition: ISDOpcodes.h:1428
llvm::ISD::SETO
@ SETO
Definition: ISDOpcodes.h:1413
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:325
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:104
MachineInstr.h
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:494
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::SIInstrFlags::WQM
@ WQM
Definition: SIDefines.h:77
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::TailPredication::Disabled
@ Disabled
Definition: ARMTargetTransformInfo.h:43
UseMI
MachineInstrBuilder & UseMI
Definition: AArch64ExpandPseudoInsts.cpp:103
llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:53
llvm::ISD::SETGT
@ SETGT
Definition: ISDOpcodes.h:1425
llvm::ISD::SETNE
@ SETNE
Definition: ISDOpcodes.h:1429
llvm::MachineRegisterInfo::createVirtualRegister
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition: MachineRegisterInfo.cpp:156
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:50
llvm::MCRegister::from
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition: MCRegister.h:67
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition: MachineInstrBuilder.h:224
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::ISD::SETEQ
@ SETEQ
Definition: ISDOpcodes.h:1424
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:116
llvm::X86Disassembler::Reg
Reg
All possible values of the reg field in the ModR/M byte.
Definition: X86DisassemblerDecoder.h:462
MapVector.h
llvm::LiveRange::Segment
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:162
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
llvm::createSIWholeQuadModePass
FunctionPass * createSIWholeQuadModePass()
Definition: SIWholeQuadMode.cpp:267
MachineBasicBlock.h
llvm::ISD::SETULE
@ SETULE
Definition: ISDOpcodes.h:1419
Instructions
Code Generation Notes for reduce the size of the ISel and reduce repetition in the implementation In a small number of this can cause even when no optimisation has taken place Instructions
Definition: MSA.txt:11
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:136
llvm::MachineFunctionProperties::Property::IsSSA
@ IsSSA
llvm::MachineFunctionProperties
Properties which a MachineFunction may have at a given point in time.
Definition: MachineFunction.h:111
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
splitBlock
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MachineDominatorTree *MDT)
Definition: SILateBranchLowering.cpp:98
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::ISD::SETOEQ
@ SETOEQ
Definition: ISDOpcodes.h:1407
llvm::LiveQueryResult
Result of a LiveRange query.
Definition: LiveInterval.h:90
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1620
llvm::ISD::SETUEQ
@ SETUEQ
Definition: ISDOpcodes.h:1415
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:103
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::MachineFunction::front
const MachineBasicBlock & front() const
Definition: MachineFunction.h:834
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:650
llvm::MachineBasicBlock::remove
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
Definition: MachineBasicBlock.h:952
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition: MachineInstrBuilder.h:146
llvm::TargetRegisterInfo::getSubRegIndexLaneMask
LaneBitmask getSubRegIndexLaneMask(unsigned SubIdx) const
Return a bitmask representing the parts of a register that are covered by SubIdx.
Definition: TargetRegisterInfo.h:378
GCNSubtarget.h
llvm::ISD::SETGE
@ SETGE
Definition: ISDOpcodes.h:1426
llvm::LaneBitmask::getNone
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:83
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:45
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::AMDGPU::PALMD::Key
Key
PAL metadata keys.
Definition: AMDGPUMetadata.h:486
llvm::AMDGPU::getVOPe32
LLVM_READONLY int getVOPe32(uint16_t Opcode)
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:127
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48
llvm::MachineFunctionProperties::set
MachineFunctionProperties & set(Property P)
Definition: MachineFunction.h:180
llvm::MCID::Flag
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:147
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:54
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:230
llvm::SlotIndexes
SlotIndexes pass.
Definition: SlotIndexes.h:313
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::ISD::SETOLT
@ SETOLT
Definition: ISDOpcodes.h:1410
llvm::LiveQueryResult::valueIn
VNInfo * valueIn() const
Return the value that is live-in to the instruction.
Definition: LiveInterval.h:105
llvm::SlotIndex
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:82
llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
llvm::ISD::SETOLE
@ SETOLE
Definition: ISDOpcodes.h:1411
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
llvm::ISD::SETUGT
@ SETUGT
Definition: ISDOpcodes.h:1416
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::MachineRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
Definition: MachineRegisterInfo.h:642
llvm::ARM_PROC::IE
@ IE
Definition: ARMBaseInfo.h:27
llvm::ISD::SETUNE
@ SETUNE
Definition: ISDOpcodes.h:1420
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:640
llvm::Function::hasFnAttribute
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:625
llvm::MachineOperand::isUndef
bool isUndef() const
Definition: MachineOperand.h:394
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
llvm::LiveRange::getVNInfoBefore
VNInfo * getVNInfoBefore(SlotIndex Idx) const
getVNInfoBefore - Return the VNInfo that is live up to but not necessarily including Idx, or NULL.
Definition: LiveInterval.h:429
AMDGPUMCTargetDesc.h
llvm::MachineBasicBlock::pred_end
pred_iterator pred_end()
Definition: MachineBasicBlock.h:327
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:320
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
LiveIntervals.h
llvm::Function::getCallingConv
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition: Function.h:238
llvm::LiveRange
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::DenseMap
Definition: DenseMap.h:716
llvm::codeview::FrameCookieKind::Copy
@ Copy
llvm::MachineFunction::dump
void dump() const
dump - Print the current MachineFunction to cerr, useful for debugger use.
Definition: MachineFunction.cpp:562
llvm::LaneBitmask::any
constexpr bool any() const
Definition: LaneBitmask.h:53
llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:608
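A hedged sketch (the condition is illustrative, not this pass's actual lowering): the iterator is advanced before the body runs, so erasing the current instruction is safe.
  for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
    if (MI.isDebugInstr())
      MI.eraseFromParent();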
llvm::MachineBasicBlock::getFirstNonPHI
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: MachineBasicBlock.cpp:196
llvm::LiveRange::Query
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
Definition: LiveInterval.h:541
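A sketch of a typical liveness query, assuming LIS (LiveIntervals), an instruction MI, and a virtual register Reg are in scope:
  SlotIndex Idx = LIS->getInstructionIndex(MI);
  const LiveRange &LR = LIS->getInterval(Reg);
  if (const VNInfo *VN = LR.Query(Idx).valueIn()) {
    // Reg is live-in to MI and carries value number VN.
    (void)VN;
  }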
MachineFunctionPass.h
llvm::MachineFunction::getName
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Definition: MachineFunction.cpp:567
llvm::ISD::SETOGT
@ SETOGT
Definition: ISDOpcodes.h:1408
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::MachineBasicBlock::succ_begin
succ_iterator succ_begin()
Definition: MachineBasicBlock.h:341
llvm::ISD::SETULT
@ SETULT
Definition: ISDOpcodes.h:1418
llvm::IndexedInstrProf::HashT::Last
@ Last
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
MachinePostDominators.h
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:359
llvm::MachineBasicBlock::predecessors
iterator_range< pred_iterator > predecessors()
Definition: MachineBasicBlock.h:358
llvm::MachineFunction
Definition: MachineFunction.h:241
llvm::AMDGPU::CPol::SCC
@ SCC
Definition: SIDefines.h:303
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1614
llvm::CallingConv::AMDGPU_PS
@ AMDGPU_PS
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:210
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:364
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:58
llvm::MachineBasicBlock::rbegin
reverse_iterator rbegin()
Definition: MachineBasicBlock.h:281
AMDGPU.h
MBBI
MachineBasicBlock MachineBasicBlock::iterator MBBI
Definition: AArch64SLSHardening.cpp:75
llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:491
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::DominatorTreeBase
Core dominator tree base class.
Definition: LoopInfo.h:65
llvm::SmallSet::insert
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
llvm::SIWholeQuadModeID
char & SIWholeQuadModeID
Definition: SIWholeQuadMode.cpp:265
llvm::MachinePostDominatorTree
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-dominator tree.
Definition: MachinePostDominators.h:27
MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
CallingConv.h
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::ISD::SETLT
@ SETLT
Definition: ISDOpcodes.h:1427
llvm::MachineRegisterInfo::getMaxLaneMaskForVReg
LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual register Reg.
Definition: MachineRegisterInfo.cpp:489
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:606
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:344
llvm::RegState::Define
@ Define
Register definition.
Definition: MachineInstrBuilder.h:44
llvm::ISD::SETUO
@ SETUO
Definition: ISDOpcodes.h:1414
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_END(SIWholeQuadMode
llvm::LiveIntervals
Definition: LiveIntervals.h:54
llvm::VNInfo
VNInfo - Value Number Information.
Definition: LiveInterval.h:53
llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:291
DEBUG_TYPE
#define DEBUG_TYPE
Definition: SIWholeQuadMode.cpp:87
llvm::MachineInstr::setDesc
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
Definition: MachineInstr.h:1720
llvm::SIInstrInfo
Definition: SIInstrInfo.h:43
llvm::MCRegUnitIterator
Definition: MCRegisterInfo.h:680
llvm::ISD::SETOGE
@ SETOGE
Definition: ISDOpcodes.h:1409
PostOrderIterator.h
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:328
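A sketch of how the exact-mode transition from the file header could be emitted with BuildMI (wave64 assumed; Before and After are hypothetical insertion points, LiveMaskReg as used by this pass):
  Register Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
  BuildMI(MBB, Before, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Tmp)
      .addReg(LiveMaskReg);
  // ... exact instructions ...
  BuildMI(MBB, After, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(Tmp, RegState::Kill);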
llvm::MCRegisterInfo::DiffListIterator::isValid
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
Definition: MCRegisterInfo.h:224
llvm::LiveRange::getSegmentContaining
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
Definition: LiveInterval.h:408
llvm::MachineBasicBlock::empty
bool empty() const
Definition: MachineBasicBlock.h:249
llvm::ISD::SETONE
@ SETONE
Definition: ISDOpcodes.h:1412
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
raw_ostream.h
llvm::MachineDominatorTree
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: MachineDominators.h:51
llvm::MachineInstrBundleIterator< MachineInstr >
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::MachineBasicBlock::end
iterator end()
Definition: MachineBasicBlock.h:279
llvm::LaneBitmask::getAll
static constexpr LaneBitmask getAll()
Definition: LaneBitmask.h:84
SubReg
unsigned SubReg
Definition: AArch64AdvSIMDScalarPass.cpp:104
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
MachineDominators.h
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:927
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37