//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
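/// As an overall illustration of the transformation (a hypothetical sequence,
/// not lifted from a real shader), a pixel shader that samples a texture and
/// then stores the result could be rewritten from
///
///   IMAGE_SAMPLE %v, ...        ; needs WQM for derivatives
///   BUFFER_STORE_DWORD %v, ...  ; has a side effect, must be Exact
///
/// into
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///   IMAGE_SAMPLE %v, ...
///   S_AND_B64 EXEC, EXEC, LiveMask
///   BUFFER_STORE_DWORD %v, ...
///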
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};
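
// Note that the states are bit flags, so a single char can describe a *set*
// of allowed states; e.g. processBlock() below uses (StateExact | StateWQM)
// for an instruction that may execute in either of those two modes.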

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;    // States this instruction must execute in.
  char Disabled = 0; // States that may not be used for this instruction.
  char OutNeeds = 0; // States required after this instruction executes.
};

struct BlockInfo {
  char Needs = 0;    // States required somewhere within the block.
  char InNeeds = 0;  // States required on entry to the block.
  char OutNeeds = 0; // States required on exit from the block.
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and in addition
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            unsigned Reg = Inactive.getReg();
            if (TargetRegisterInfo::isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
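///
/// For example (a hypothetical scenario, not lifted from a real shader), an
/// S_AND_B64 that computes a thread mask feeding a conditional branch may be
/// marked as needing WQM by the analysis, yet as an SALU instruction it
/// executes identically under any exec mask, so no mode transition is needed
/// on its account.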
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
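//
// A sketch of the hazard being avoided (the instruction names are real, the
// sequence is hypothetical): placing an SCC-clobbering S_AND_B64 between an
// S_CMP_EQ_U32 and the S_CSELECT_B32 that consumes its SCC result would
// silently corrupt the select. So we either shift the insertion point out of
// the SCC live range, or, failing that, wrap the insertion with saveSCC().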
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
               SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

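// Walk a single block and insert the exec mask transitions computed by the
// analysis. As an illustrative sketch (the block contents are hypothetical),
// a block whose instructions need [WQM, Exact, WQM] is traversed once, with
// toExact(...) placed before the Exact-only instruction and toWQM(...) after
// it; FirstWQM/FirstWWM track how far back each transition may safely be
// hoisted to avoid excessive switching.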
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state that
          // already matches our needs, but we shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);
    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    if (!(GlobalFlags & StateWWM))
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}