1//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass adds instructions to enable whole quad mode (strict or non-strict)
11/// for pixel shaders, and strict whole wavefront mode for all programs.
12///
13/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow: specifically, an inactive lane enabled by a strict WQM/WWM
/// will always be enabled irrespective of control flow decisions. Conversely,
/// in non-strict WQM inactive lanes may be affected by control flow decisions.
17///
18/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM is
20/// enabled when necessary, but disabled around stores and atomics.
21///
22/// When necessary, this pass creates a function prolog
23///
24/// S_MOV_B64 LiveMask, EXEC
25/// S_WQM_B64 EXEC, EXEC
26///
27/// to enter WQM at the top of the function and surrounds blocks of Exact
28/// instructions by
29///
30/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
31/// ...
32/// S_MOV_B64 EXEC, Tmp
33///
34/// We also compute when a sequence of instructions requires strict whole
35/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36///
37/// S_OR_SAVEEXEC_B64 Tmp, -1
38/// ...
39/// S_MOV_B64 EXEC, Tmp
40///
41/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42/// we use a similar save and restore mechanism and force whole quad mode for
43/// those instructions:
44///
45/// S_MOV_B64 Tmp, EXEC
46/// S_WQM_B64 EXEC, EXEC
47/// ...
48/// S_MOV_B64 EXEC, Tmp
49///
50/// In order to avoid excessive switching during sequences of Exact
51/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
53/// computations).
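///
/// For example (illustrative only; the registers and exact instructions are
/// invented for this sketch), a pixel shader fragment such as
///
///   %v = IMAGE_SAMPLE ...        ; needs WQM: feeds implicit derivatives
///   BUFFER_STORE_DWORD %v, ...   ; side effect: must run in Exact mode
///
/// is executed with EXEC widened to whole quads for the sample, and switched
/// back to the live mask only around the store, rather than toggling modes
/// per instruction.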
54///
55/// Basic blocks are always exited in WQM as long as some successor needs WQM.
56///
57/// There is room for improvement given better control flow analysis:
58///
59/// (1) at the top level (outside of control flow statements, and as long as
60/// kill hasn't been used), one SGPR can be saved by recovering WQM from
61/// the LiveMask (this is implemented for the entry block).
62///
63/// (2) when entire regions (e.g. if-else blocks or entire loops) only
64/// consist of exact and don't-care instructions, the switch only has to
65/// be done at the entry and exit points rather than potentially in each
66/// block of the region.
67///
68//===----------------------------------------------------------------------===//
69
70#include "AMDGPU.h"
71#include "GCNSubtarget.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
84
85using namespace llvm;
86
87#define DEBUG_TYPE "si-wqm"
88
89namespace {
90
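// Execution states used by this pass. These are bitmask values: the
// Needs/Disabled/InNeeds/OutNeeds fields below hold OR-ed combinations of
// these flags rather than a single enumerator.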
91enum {
92 StateWQM = 0x1,
93 StateStrictWWM = 0x2,
94 StateStrictWQM = 0x4,
95 StateExact = 0x8,
96 StateStrict = StateStrictWWM | StateStrictWQM,
97};
98
99struct PrintState {
100public:
101 int State;
102
103 explicit PrintState(int State) : State(State) {}
104};
105
106#ifndef NDEBUG
107static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108
109 static const std::pair<char, const char *> Mapping[] = {
110 std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111 std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 char State = PS.State;
113 for (auto M : Mapping) {
114 if (State & M.first) {
115 OS << M.second;
116 State &= ~M.first;
117
118 if (State)
119 OS << '|';
120 }
121 }
122 assert(State == 0);
123 return OS;
124}
125#endif
126
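// Per-instruction analysis results: Needs holds the states the instruction
// itself requires, Disabled the states it must not run in, OutNeeds the
// states required by instructions executed after it, and MarkedStates every
// state that was ever requested for it (including disabled ones).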
127struct InstrInfo {
128 char Needs = 0;
129 char Disabled = 0;
130 char OutNeeds = 0;
131 char MarkedStates = 0;
132};
133
134struct BlockInfo {
135 char Needs = 0;
136 char InNeeds = 0;
137 char OutNeeds = 0;
138 char InitialState = 0;
139 bool NeedsLowering = false;
140};
141
142struct WorkItem {
143 MachineBasicBlock *MBB = nullptr;
144 MachineInstr *MI = nullptr;
145
  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};
150
151class SIWholeQuadMode : public MachineFunctionPass {
152private:
153 const SIInstrInfo *TII;
154 const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

161 unsigned AndOpc;
162 unsigned AndTermOpc;
163 unsigned AndN2Opc;
164 unsigned XorOpc;
165 unsigned AndSaveExecOpc;
166 unsigned AndSaveExecTermOpc;
167 unsigned WQMOpc;
168 Register Exec;
169 Register LiveMaskReg;
170
  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

177 SmallVector<MachineInstr *, 2> LiveMaskQueries;
178 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
179 SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;
  SmallVector<MachineInstr *, 4> InitExecInstrs;
182 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
183
184 void printInfo();
185
186 void markInstruction(MachineInstr &MI, char Flag,
187 std::vector<WorkItem> &Worklist);
188 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
189 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
190 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
191 std::vector<WorkItem> &Worklist);
192 void markInstructionUses(const MachineInstr &MI, char Flag,
193 std::vector<WorkItem> &Worklist);
194 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
195 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
196 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

221 void lowerBlock(MachineBasicBlock &MBB);
222 void processBlock(MachineBasicBlock &MBB, bool IsEntry);
223
224 bool lowerLiveMaskQueries();
225 bool lowerCopyInstrs();
226 bool lowerKillInstrs(bool IsWQM);
227 void lowerInitExec(MachineInstr &MI);
228 MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
229 bool &Changed);
230
231public:
232 static char ID;
233
  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

237 bool runOnMachineFunction(MachineFunction &MF) override;
238
239 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
240
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
254};
255
256} // end anonymous namespace
257
258char SIWholeQuadMode::ID = 0;
259
260INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
261 false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
266 false)
267
268char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
269
FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
272}
273
274#ifndef NDEBUG
275LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
276 for (const auto &BII : Blocks) {
277 dbgs() << "\n"
278 << printMBBReference(*BII.first) << ":\n"
279 << " InNeeds = " << PrintState(BII.second.InNeeds)
280 << ", Needs = " << PrintState(BII.second.Needs)
281 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
282
283 for (const MachineInstr &MI : *BII.first) {
284 auto III = Instructions.find(&MI);
285 if (III != Instructions.end()) {
286 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
287 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
288 }
289 }
290 }
291}
292#endif
293
294void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
295 std::vector<WorkItem> &Worklist) {
296 InstrInfo &II = Instructions[&MI];
297
298 assert(!(Flag & StateExact) && Flag != 0);
299
300 // Capture all states requested in marking including disabled ones.
301 II.MarkedStates |= Flag;
302
303 // Remove any disabled states from the flag. The user that required it gets
304 // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
306 // ignoring the request for WQM is correct as per the relevant specs.
307 Flag &= ~II.Disabled;
308
309 // Ignore if the flag is already encompassed by the existing needs, or we
310 // just disabled everything.
311 if ((II.Needs & Flag) == Flag)
312 return;
313
314 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
315 II.Needs |= Flag;
316 Worklist.emplace_back(&MI);
317}
318
319/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
320void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
321 Register Reg, unsigned SubReg, char Flag,
322 std::vector<WorkItem> &Worklist) {
323 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
324
325 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
326 const VNInfo *Value = UseLRQ.valueIn();
327 if (!Value)
328 return;
329
330 // Note: this code assumes that lane masks on AMDGPU completely
331 // cover registers.
332 const LaneBitmask UseLanes =
333 SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
334 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                  : LaneBitmask::getNone());

337 // Perform a depth-first iteration of the LiveRange graph marking defs.
338 // Stop processing of a given branch when all use lanes have been defined.
339 // The first definition stops processing for a physical register.
340 struct PhiEntry {
341 const VNInfo *Phi;
342 unsigned PredIdx;
343 LaneBitmask DefinedLanes;
344
345 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
346 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
347 };
348 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
351 LaneBitmask DefinedLanes;
352 unsigned NextPredIdx = 0; // Only used for processing phi nodes
353 do {
354 const VNInfo *NextValue = nullptr;
355 const VisitKey Key(Value, DefinedLanes);
356
357 if (Visited.insert(Key).second) {
      // On the first visit to a phi, start processing from the first predecessor
359 NextPredIdx = 0;
360 }
361
362 if (Value->isPHIDef()) {
363 // Each predecessor node in the phi must be processed as a subgraph
364 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
365 assert(MBB && "Phi-def has no defining MBB");
366
367 // Find next predecessor to process
368 unsigned Idx = NextPredIdx;
369 const auto *PI = MBB->pred_begin() + Idx;
370 const auto *PE = MBB->pred_end();
371 for (; PI != PE && !NextValue; ++PI, ++Idx) {
372 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
373 if (!Visited.count(VisitKey(VN, DefinedLanes)))
374 NextValue = VN;
375 }
376 }
377
      // If there are more predecessors to process, add the phi to the stack
379 if (PI != PE)
380 PhiStack.emplace_back(Value, Idx, DefinedLanes);
381 } else {
382 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
383 assert(MI && "Def has no defining instruction");
384
385 if (Reg.isVirtual()) {
386 // Iterate over all operands to find relevant definitions
387 bool HasDef = false;
388 for (const MachineOperand &Op : MI->all_defs()) {
389 if (Op.getReg() != Reg)
390 continue;
391
392 // Compute lanes defined and overlap with use
393 LaneBitmask OpLanes =
394 Op.isUndef() ? LaneBitmask::getAll()
395 : TRI->getSubRegIndexLaneMask(Op.getSubReg());
396 LaneBitmask Overlap = (UseLanes & OpLanes);
397
          // Record if this instruction defined any lanes of the use
399 HasDef |= Overlap.any();
400
401 // Mark any lanes defined
402 DefinedLanes |= OpLanes;
403 }
404
405 // Check if all lanes of use have been defined
406 if ((DefinedLanes & UseLanes) != UseLanes) {
407 // Definition not complete; need to process input value
408 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
409 if (const VNInfo *VN = LRQ.valueIn()) {
410 if (!Visited.count(VisitKey(VN, DefinedLanes)))
411 NextValue = VN;
412 }
413 }
414
415 // Only mark the instruction if it defines some part of the use
416 if (HasDef)
417 markInstruction(*MI, Flag, Worklist);
418 } else {
419 // For physical registers simply mark the defining instruction
420 markInstruction(*MI, Flag, Worklist);
421 }
422 }
423
424 if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi
426 PhiEntry &Entry = PhiStack.back();
427 NextValue = Entry.Phi;
428 NextPredIdx = Entry.PredIdx;
429 DefinedLanes = Entry.DefinedLanes;
430 PhiStack.pop_back();
431 }
432
433 Value = NextValue;
434 } while (Value);
435}
436
437void SIWholeQuadMode::markOperand(const MachineInstr &MI,
438 const MachineOperand &Op, char Flag,
439 std::vector<WorkItem> &Worklist) {
440 assert(Op.isReg());
441 Register Reg = Op.getReg();
442
443 // Ignore some hardware registers
444 switch (Reg) {
445 case AMDGPU::EXEC:
446 case AMDGPU::EXEC_LO:
447 return;
448 default:
449 break;
450 }
451
452 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
453 << " for " << MI);
454 if (Reg.isVirtual()) {
455 LiveRange &LR = LIS->getInterval(Reg);
456 markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
457 } else {
458 // Handle physical registers that we need to track; this is mostly relevant
459 // for VCC, which can appear as the (implicit) input of a uniform branch,
460 // e.g. when a loop counter is stored in a VGPR.
461 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
462 LiveRange &LR = LIS->getRegUnit(Unit);
463 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
464 if (Value)
465 markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
466 }
467 }
468}
469
470/// Mark all instructions defining the uses in \p MI with \p Flag.
471void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
472 std::vector<WorkItem> &Worklist) {
473 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
474 << MI);
475
476 for (const MachineOperand &Use : MI.all_uses())
477 markOperand(MI, Use, Flag, Worklist);
478}
479
// Scan instructions to determine which ones require an Exact exec mask and
481// which ones seed WQM requirements.
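// The return value is the union of all states seen (GlobalFlags); marked
// instructions are pushed onto the worklist so that their requirements can be
// propagated afterwards by analyzeFunction().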
482char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
483 std::vector<WorkItem> &Worklist) {
484 char GlobalFlags = 0;
485 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
486 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

490 // We need to visit the basic blocks in reverse post-order so that we visit
491 // defs before uses, in particular so that we don't accidentally mark an
492 // instruction as needing e.g. WQM before visiting it and realizing it needs
493 // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
496 BlockInfo &BBI = Blocks[MBB];
497
498 for (MachineInstr &MI : *MBB) {
499 InstrInfo &III = Instructions[&MI];
500 unsigned Opcode = MI.getOpcode();
501 char Flags = 0;
502
503 if (TII->isWQM(Opcode)) {
504 // If LOD is not supported WQM is not needed.
505 // Only generate implicit WQM if implicit derivatives are required.
506 // This avoids inserting unintended WQM if a shader type without
507 // implicit derivatives uses an image sampling instruction.
508 if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
509 // Sampling instructions don't need to produce results for all pixels
510 // in a quad, they just require all inputs of a quad to have been
511 // computed for derivatives.
512 markInstructionUses(MI, StateWQM, Worklist);
513 GlobalFlags |= StateWQM;
514 }
515 } else if (Opcode == AMDGPU::WQM) {
516 // The WQM intrinsic requires its output to have all the helper lanes
517 // correct, so we need it to be in WQM.
518 Flags = StateWQM;
519 LowerToCopyInstrs.insert(&MI);
520 } else if (Opcode == AMDGPU::SOFT_WQM) {
521 LowerToCopyInstrs.insert(&MI);
522 SoftWQMInstrs.push_back(&MI);
523 } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and
525 // it needs to be executed in WQM or Exact so that its copy doesn't
526 // clobber inactive lanes.
527 markInstructionUses(MI, StateStrictWWM, Worklist);
528 GlobalFlags |= StateStrictWWM;
529 LowerToMovInstrs.push_back(&MI);
530 } else if (Opcode == AMDGPU::STRICT_WQM ||
531 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
534 // quads that have at least one active thread.
535 markInstructionUses(MI, StateStrictWQM, Worklist);
536 GlobalFlags |= StateStrictWQM;
537
538 if (Opcode == AMDGPU::STRICT_WQM) {
539 LowerToMovInstrs.push_back(&MI);
540 } else {
          // Dual source blend export acts as an implicit strict-wqm; its sources
542 // need to be shuffled in strict wqm, but the export itself needs to
543 // run in exact mode.
544 BBI.Needs |= StateExact;
545 if (!(BBI.InNeeds & StateExact)) {
546 BBI.InNeeds |= StateExact;
547 Worklist.emplace_back(MBB);
548 }
549 GlobalFlags |= StateExact;
550 III.Disabled = StateWQM | StateStrict;
551 }
552 } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
553 Opcode == AMDGPU::DS_PARAM_LOAD ||
554 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
555 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these STRICT_WQM, but only for the instruction, not its operands.
        // This avoids unnecessarily marking M0 as requiring WQM.
558 III.Needs |= StateStrictWQM;
559 GlobalFlags |= StateStrictWQM;
560 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
561 // Disable strict states; StrictWQM will be added as required later.
562 III.Disabled = StateStrict;
563 MachineOperand &Inactive = MI.getOperand(4);
564 if (Inactive.isReg()) {
565 if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
566 LowerToCopyInstrs.insert(&MI);
567 else
568 markOperand(MI, Inactive, StateStrictWWM, Worklist);
569 }
570 SetInactiveInstrs.push_back(&MI);
571 BBI.NeedsLowering = true;
572 } else if (TII->isDisableWQM(MI)) {
573 BBI.Needs |= StateExact;
574 if (!(BBI.InNeeds & StateExact)) {
575 BBI.InNeeds |= StateExact;
576 Worklist.emplace_back(MBB);
577 }
578 GlobalFlags |= StateExact;
579 III.Disabled = StateWQM | StateStrict;
580 } else if (Opcode == AMDGPU::SI_PS_LIVE ||
581 Opcode == AMDGPU::SI_LIVE_MASK) {
582 LiveMaskQueries.push_back(&MI);
583 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
584 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
585 Opcode == AMDGPU::SI_DEMOTE_I1) {
586 KillInstrs.push_back(&MI);
587 BBI.NeedsLowering = true;
588 } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
589 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
590 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
591 InitExecInstrs.push_back(&MI);
592 } else if (WQMOutputs) {
593 // The function is in machine SSA form, which means that physical
594 // VGPRs correspond to shader inputs and outputs. Inputs are
595 // only used, outputs are only defined.
596 // FIXME: is this still valid?
597 for (const MachineOperand &MO : MI.defs()) {
598 Register Reg = MO.getReg();
599 if (Reg.isPhysical() &&
600 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
601 Flags = StateWQM;
602 break;
603 }
604 }
605 }
606
607 if (Flags) {
608 markInstruction(MI, Flags, Worklist);
609 GlobalFlags |= Flags;
610 }
611 }
612 }
613
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
615 // ever used anywhere in the function. This implements the corresponding
616 // semantics of @llvm.amdgcn.set.inactive.
617 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
618 if (GlobalFlags & StateWQM) {
619 for (MachineInstr *MI : SetInactiveInstrs)
620 markInstruction(*MI, StateWQM, Worklist);
621 for (MachineInstr *MI : SoftWQMInstrs)
622 markInstruction(*MI, StateWQM, Worklist);
623 }
624
625 return GlobalFlags;
626}
627
628void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
629 std::vector<WorkItem>& Worklist) {
630 MachineBasicBlock *MBB = MI.getParent();
631 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
632 BlockInfo &BI = Blocks[MBB];
633
634 // Control flow-type instructions and stores to temporary memory that are
635 // followed by WQM computations must themselves be in WQM.
636 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
637 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
638 Instructions[&MI].Needs = StateWQM;
639 II.Needs = StateWQM;
640 }
641
642 // Propagate to block level
643 if (II.Needs & StateWQM) {
644 BI.Needs |= StateWQM;
645 if (!(BI.InNeeds & StateWQM)) {
646 BI.InNeeds |= StateWQM;
647 Worklist.emplace_back(MBB);
648 }
649 }
650
651 // Propagate backwards within block
652 if (MachineInstr *PrevMI = MI.getPrevNode()) {
653 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
654 if (!PrevMI->isPHI()) {
655 InstrInfo &PrevII = Instructions[PrevMI];
656 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
657 PrevII.OutNeeds |= InNeeds;
658 Worklist.emplace_back(PrevMI);
659 }
660 }
661 }
662
663 // Propagate WQM flag to instruction inputs
664 assert(!(II.Needs & StateExact));
665
666 if (II.Needs != 0)
667 markInstructionUses(MI, II.Needs, Worklist);
668
669 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
670 // not require any WQM transitions.
671 if (II.Needs & StateStrictWWM)
672 BI.Needs |= StateStrictWWM;
673 if (II.Needs & StateStrictWQM)
674 BI.Needs |= StateStrictWQM;
675}
676
677void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
678 std::vector<WorkItem>& Worklist) {
679 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
680
681 // Propagate through instructions
682 if (!MBB.empty()) {
683 MachineInstr *LastMI = &*MBB.rbegin();
684 InstrInfo &LastII = Instructions[LastMI];
685 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
686 LastII.OutNeeds |= BI.OutNeeds;
687 Worklist.emplace_back(LastMI);
688 }
689 }
690
691 // Predecessor blocks must provide for our WQM/Exact needs.
692 for (MachineBasicBlock *Pred : MBB.predecessors()) {
693 BlockInfo &PredBI = Blocks[Pred];
694 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
695 continue;
696
697 PredBI.OutNeeds |= BI.InNeeds;
698 PredBI.InNeeds |= BI.InNeeds;
699 Worklist.emplace_back(Pred);
700 }
701
702 // All successors must be prepared to accept the same set of WQM/Exact data.
703 for (MachineBasicBlock *Succ : MBB.successors()) {
704 BlockInfo &SuccBI = Blocks[Succ];
705 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
706 continue;
707
708 SuccBI.InNeeds |= BI.OutNeeds;
709 Worklist.emplace_back(Succ);
710 }
711}
712
713char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
714 std::vector<WorkItem> Worklist;
715 char GlobalFlags = scanInstructions(MF, Worklist);
716
717 while (!Worklist.empty()) {
718 WorkItem WI = Worklist.back();
719 Worklist.pop_back();
720
721 if (WI.MI)
722 propagateInstruction(*WI.MI, Worklist);
723 else
724 propagateBlock(*WI.MBB, Worklist);
725 }
726
727 return GlobalFlags;
728}
729
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
733 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
734
735 MachineInstr *Save =
736 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
737 .addReg(AMDGPU::SCC);
738 MachineInstr *Restore =
739 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
740 .addReg(SaveReg);
741
742 LIS->InsertMachineInstrInMaps(*Save);
743 LIS->InsertMachineInstrInMaps(*Restore);
744 LIS->createAndComputeVirtRegInterval(SaveReg);
745
746 return Restore;
747}
748
749MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
750 MachineInstr *TermMI) {
751 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
752 << *TermMI << "\n");
753
754 MachineBasicBlock *SplitBB =
755 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
756
757 // Convert last instruction in block to a terminator.
758 // Note: this only covers the expected patterns
759 unsigned NewOpcode = 0;
760 switch (TermMI->getOpcode()) {
761 case AMDGPU::S_AND_B32:
762 NewOpcode = AMDGPU::S_AND_B32_term;
763 break;
764 case AMDGPU::S_AND_B64:
765 NewOpcode = AMDGPU::S_AND_B64_term;
766 break;
767 case AMDGPU::S_MOV_B32:
768 NewOpcode = AMDGPU::S_MOV_B32_term;
769 break;
770 case AMDGPU::S_MOV_B64:
771 NewOpcode = AMDGPU::S_MOV_B64_term;
772 break;
773 default:
774 break;
775 }
776 if (NewOpcode)
777 TermMI->setDesc(TII->get(NewOpcode));
778
779 if (SplitBB != BB) {
780 // Update dominator trees
781 using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
784 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
785 DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
786 }
787 DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
788 if (MDT)
789 MDT->applyUpdates(DTUpdates);
790 if (PDT)
791 PDT->applyUpdates(DTUpdates);
792
793 // Link blocks
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
796 .addMBB(SplitBB);
797 LIS->InsertMachineInstrInMaps(*MI);
798 }
799
800 return SplitBB;
801}
802
803MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
804 MachineInstr &MI) {
805 assert(LiveMaskReg.isVirtual());
806
807 const DebugLoc &DL = MI.getDebugLoc();
808 unsigned Opcode = 0;
809
810 assert(MI.getOperand(0).isReg());
811
812 // Comparison is for live lanes; however here we compute the inverse
813 // (killed lanes). This is because VCMP will always generate 0 bits
814 // for inactive lanes so a mask of live lanes would not be correct
815 // inside control flow.
816 // Invert the comparison by swapping the operands and adjusting
817 // the comparison codes.
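  // For example, a kill whose live condition is SETOLT (live where a < b) is
  // lowered to V_CMP_NGT with the operands swapped, so that VCC holds exactly
  // the lanes being killed.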
818
819 switch (MI.getOperand(2).getImm()) {
820 case ISD::SETUEQ:
821 Opcode = AMDGPU::V_CMP_LG_F32_e64;
822 break;
823 case ISD::SETUGT:
824 Opcode = AMDGPU::V_CMP_GE_F32_e64;
825 break;
826 case ISD::SETUGE:
827 Opcode = AMDGPU::V_CMP_GT_F32_e64;
828 break;
829 case ISD::SETULT:
830 Opcode = AMDGPU::V_CMP_LE_F32_e64;
831 break;
832 case ISD::SETULE:
833 Opcode = AMDGPU::V_CMP_LT_F32_e64;
834 break;
835 case ISD::SETUNE:
836 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
837 break;
838 case ISD::SETO:
839 Opcode = AMDGPU::V_CMP_O_F32_e64;
840 break;
841 case ISD::SETUO:
842 Opcode = AMDGPU::V_CMP_U_F32_e64;
843 break;
844 case ISD::SETOEQ:
845 case ISD::SETEQ:
846 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
847 break;
848 case ISD::SETOGT:
849 case ISD::SETGT:
850 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
851 break;
852 case ISD::SETOGE:
853 case ISD::SETGE:
854 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
855 break;
856 case ISD::SETOLT:
857 case ISD::SETLT:
858 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
859 break;
860 case ISD::SETOLE:
861 case ISD::SETLE:
862 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
863 break;
864 case ISD::SETONE:
865 case ISD::SETNE:
866 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
867 break;
868 default:
869 llvm_unreachable("invalid ISD:SET cond code");
870 }
871
872 // Pick opcode based on comparison type.
873 MachineInstr *VcmpMI;
874 const MachineOperand &Op0 = MI.getOperand(0);
875 const MachineOperand &Op1 = MI.getOperand(1);
876
877 // VCC represents lanes killed.
878 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
879
880 if (TRI->isVGPR(*MRI, Op0.getReg())) {
881 Opcode = AMDGPU::getVOPe32(Opcode);
882 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
883 } else {
884 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
887 .add(Op1)
888 .addImm(0) // src1 modifiers
889 .add(Op0)
890 .addImm(0); // omod
891 }
892
893 MachineInstr *MaskUpdateMI =
894 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
895 .addReg(LiveMaskReg)
896 .addReg(VCC);
897
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
900 MachineInstr *EarlyTermMI =
901 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
902
903 MachineInstr *ExecMaskMI =
904 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
905
906 assert(MBB.succ_size() == 1);
907 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
908 .addMBB(*MBB.succ_begin());
909
910 // Update live intervals
911 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
912 MBB.remove(&MI);
913
914 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
915 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
916 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
917 LIS->InsertMachineInstrInMaps(*NewTerm);
918
919 return NewTerm;
920}
921
922MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
923 MachineInstr &MI, bool IsWQM) {
924 assert(LiveMaskReg.isVirtual());
925
926 const DebugLoc &DL = MI.getDebugLoc();
927 MachineInstr *MaskUpdateMI = nullptr;
928
929 const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
930 const MachineOperand &Op = MI.getOperand(0);
931 int64_t KillVal = MI.getOperand(1).getImm();
932 MachineInstr *ComputeKilledMaskMI = nullptr;
933 Register CndReg = !Op.isImm() ? Op.getReg() : Register();
934 Register TmpReg;
935
936 // Is this a static or dynamic kill?
937 if (Op.isImm()) {
938 if (Op.getImm() == KillVal) {
939 // Static: all active lanes are killed
940 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
941 .addReg(LiveMaskReg)
942 .addReg(Exec);
943 } else {
944 // Static: kill does nothing
945 MachineInstr *NewTerm = nullptr;
946 if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
947 LIS->RemoveMachineInstrFromMaps(MI);
948 } else {
949 assert(MBB.succ_size() == 1);
950 NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
951 .addMBB(*MBB.succ_begin());
952 LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
953 }
954 MBB.remove(&MI);
955 return NewTerm;
956 }
957 } else {
958 if (!KillVal) {
959 // Op represents live lanes after kill,
960 // so exec mask needs to be factored in.
961 TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
962 ComputeKilledMaskMI =
963 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
964 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
965 .addReg(LiveMaskReg)
966 .addReg(TmpReg);
967 } else {
968 // Op represents lanes to kill
969 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
970 .addReg(LiveMaskReg)
971 .add(Op);
972 }
973 }
974
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
977 MachineInstr *EarlyTermMI =
978 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
979
  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
982 MachineInstr *NewTerm;
983 MachineInstr *WQMMaskMI = nullptr;
984 Register LiveMaskWQM;
985 if (IsDemote) {
986 // Demote - deactivate quads with only helper lanes
987 LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
988 WQMMaskMI =
989 BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
990 NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
991 .addReg(Exec)
992 .addReg(LiveMaskWQM);
993 } else {
994 // Kill - deactivate lanes no longer in live mask
995 if (Op.isImm()) {
996 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
997 NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
998 } else if (!IsWQM) {
999 NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1000 .addReg(Exec)
1001 .addReg(LiveMaskReg);
1002 } else {
1003 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1004 NewTerm =
1005 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1006 }
1007 }
1008
1009 // Update live intervals
1010 LIS->RemoveMachineInstrFromMaps(MI);
1011 MBB.remove(&MI);
1012 assert(EarlyTermMI);
1013 assert(MaskUpdateMI);
1014 assert(NewTerm);
1015 if (ComputeKilledMaskMI)
1016 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1017 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1018 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1019 if (WQMMaskMI)
1020 LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1021 LIS->InsertMachineInstrInMaps(*NewTerm);
1022
1023 if (CndReg) {
1024 LIS->removeInterval(CndReg);
1025 LIS->createAndComputeVirtRegInterval(CndReg);
1026 }
1027 if (TmpReg)
1028 LIS->createAndComputeVirtRegInterval(TmpReg);
1029 if (LiveMaskWQM)
1030 LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1031
1032 return NewTerm;
1033}
1034
1035// Replace (or supplement) instructions accessing live mask.
1036// This can only happen once all the live mask registers have been created
1037// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
1038void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1039 auto *BII = Blocks.find(&MBB);
1040 if (BII == Blocks.end())
1041 return;
1042
1043 const BlockInfo &BI = BII->second;
1044 if (!BI.NeedsLowering)
1045 return;
1046
1047 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1048
1050 Register ActiveLanesReg = 0;
1051 char State = BI.InitialState;
1052
  SmallVector<MachineInstr *, 4> SplitPoints;
  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    if (StateTransition.count(&MI))
1056 State = StateTransition[&MI];
1057
1058 MachineInstr *SplitPoint = nullptr;
1059 switch (MI.getOpcode()) {
1060 case AMDGPU::SI_DEMOTE_I1:
1061 case AMDGPU::SI_KILL_I1_TERMINATOR:
1062 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1063 break;
1064 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1065 SplitPoint = lowerKillF32(MBB, MI);
1066 break;
1067 case AMDGPU::ENTER_STRICT_WWM:
1068 ActiveLanesReg = MI.getOperand(0).getReg();
1069 break;
1070 case AMDGPU::EXIT_STRICT_WWM:
1071 ActiveLanesReg = 0;
1072 break;
1073 case AMDGPU::V_SET_INACTIVE_B32:
1074 if (ActiveLanesReg) {
1075 LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
1076 MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
1077 MI.getOperand(5).setReg(ActiveLanesReg);
1078 LIS->shrinkToUses(&LI);
1079 } else {
1080 assert(State == StateExact || State == StateWQM);
1081 }
1082 break;
1083 default:
1084 break;
1085 }
1086 if (SplitPoint)
1087 SplitPoints.push_back(SplitPoint);
1088 }
1089
1090 // Perform splitting after instruction scan to simplify iteration.
1091 if (!SplitPoints.empty()) {
1092 MachineBasicBlock *BB = &MBB;
1093 for (MachineInstr *MI : SplitPoints) {
1094 BB = splitBlock(BB, MI);
1095 }
1096 }
1097}
1098
1099// Return an iterator in the (inclusive) range [First, Last] at which
1100// instructions can be safely inserted, keeping in mind that some of the
1101// instructions we want to add necessarily clobber SCC.
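// (The EXEC-manipulation instructions this pass inserts, such as S_AND_B64,
// S_ANDN2_B64 and S_WQM_B64, all write SCC as a side effect.)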
1102MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1105 if (!SaveSCC)
1106 return PreferLast ? Last : First;
1107
1108 LiveRange &LR =
1109 LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1110 auto MBBE = MBB.end();
1111 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1112 : LIS->getMBBEndIdx(&MBB);
1113 SlotIndex LastIdx =
1114 Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1115 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1116 const LiveRange::Segment *S;
1117
1118 for (;;) {
1119 S = LR.getSegmentContaining(Idx);
1120 if (!S)
1121 break;
1122
1123 if (PreferLast) {
1124 SlotIndex Next = S->start.getBaseIndex();
1125 if (Next < FirstIdx)
1126 break;
1127 Idx = Next;
1128 } else {
1129 MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1130 assert(EndMI && "Segment does not end on valid instruction");
1131 auto NextI = std::next(EndMI->getIterator());
1132 if (NextI == MBB.end())
1133 break;
1134 SlotIndex Next = LIS->getInstructionIndex(*NextI);
1135 if (Next > LastIdx)
1136 break;
1137 Idx = Next;
1138 }
1139 }
1140
1142
  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1144 MBBI = MI;
1145 else {
1146 assert(Idx == LIS->getMBBEndIdx(&MBB));
1147 MBBI = MBB.end();
1148 }
1149
1150 // Move insertion point past any operations modifying EXEC.
1151 // This assumes that the value of SCC defined by any of these operations
1152 // does not need to be preserved.
1153 while (MBBI != Last) {
1154 bool IsExecDef = false;
1155 for (const MachineOperand &MO : MBBI->all_defs()) {
1156 IsExecDef |=
1157 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1158 }
1159 if (!IsExecDef)
1160 break;
1161 MBBI++;
1162 S = nullptr;
1163 }
1164
1165 if (S)
1166 MBBI = saveSCC(MBB, MBBI);
1167
1168 return MBBI;
1169}
1170
1171void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
1174 assert(LiveMaskReg.isVirtual());
1175
1176 bool IsTerminator = Before == MBB.end();
1177 if (!IsTerminator) {
1178 auto FirstTerm = MBB.getFirstTerminator();
1179 if (FirstTerm != MBB.end()) {
1180 SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1181 SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1182 IsTerminator = BeforeIdx > FirstTermIdx;
1183 }
1184 }
1185
  MachineInstr *MI;

1188 if (SaveWQM) {
1189 unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1190 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1191 .addReg(LiveMaskReg);
1192 } else {
1193 unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1194 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1195 .addReg(Exec)
1196 .addReg(LiveMaskReg);
1197 }
1198
1199 LIS->InsertMachineInstrInMaps(*MI);
1200 StateTransition[MI] = StateExact;
1201}
1202
1203void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

1208 if (SavedWQM) {
1209 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1210 .addReg(SavedWQM);
1211 } else {
1212 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1213 }
1214
1215 LIS->InsertMachineInstrInMaps(*MI);
1216 StateTransition[MI] = StateWQM;
1217}
1218
1219void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
1224 assert(StrictStateNeeded == StateStrictWWM ||
1225 StrictStateNeeded == StateStrictWQM);
1226
1227 if (StrictStateNeeded == StateStrictWWM) {
1228 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1229 SaveOrig)
1230 .addImm(-1);
1231 } else {
1232 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1233 SaveOrig)
1234 .addImm(-1);
1235 }
1236 LIS->InsertMachineInstrInMaps(*MI);
1237 StateTransition[MI] = StrictStateNeeded;
1238}
1239
1240void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
1243 char CurrentStrictState) {
  MachineInstr *MI;

1246 assert(SavedOrig);
1247 assert(CurrentStrictState == StateStrictWWM ||
1248 CurrentStrictState == StateStrictWQM);
1249
1250 if (CurrentStrictState == StateStrictWWM) {
1251 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1252 Exec)
1253 .addReg(SavedOrig);
1254 } else {
1255 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1256 Exec)
1257 .addReg(SavedOrig);
1258 }
1259 LIS->InsertMachineInstrInMaps(*MI);
1260 StateTransition[MI] = NonStrictState;
1261}
1262
1263void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1264 auto *BII = Blocks.find(&MBB);
1265 if (BII == Blocks.end())
1266 return;
1267
1268 BlockInfo &BI = BII->second;
1269
1270 // This is a non-entry block that is WQM throughout, so no need to do
1271 // anything.
1272 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1273 BI.InitialState = StateWQM;
1274 return;
1275 }
1276
1277 LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1278 << ":\n");
1279
1280 Register SavedWQMReg;
1281 Register SavedNonStrictReg;
1282 bool WQMFromExec = IsEntry;
1283 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1284 char NonStrictState = 0;
1285 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1286
1287 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1288 if (IsEntry) {
1289 // Skip the instruction that saves LiveMask
1290 if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1291 II->getOperand(1).getReg() == TRI->getExec())
1292 ++II;
1293 }
1294
1295 // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

1299 // This stores the first instruction where it's safe to switch from Strict
1300 // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1301 // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1302 // be safe to switch to/from WQM as well.
1303 MachineBasicBlock::iterator FirstStrict = IE;
1304
  // Record the initial state in the block information.
1306 BI.InitialState = State;
1307
1308 for (unsigned Idx = 0;; ++Idx) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1311 char OutNeeds = 0;
1312
1313 if (FirstWQM == IE)
1314 FirstWQM = II;
1315
1316 if (FirstStrict == IE)
1317 FirstStrict = II;
1318
    // Adjust needs if this is the first instruction of a WQM-requiring shader.
1320 if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
1321 Needs = StateWQM;
1322
1323 // First, figure out the allowed states (Needs) based on the propagated
1324 // flags.
1325 if (II != IE) {
1326 MachineInstr &MI = *II;
1327
1328 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1329 auto III = Instructions.find(&MI);
1330 if (III != Instructions.end()) {
1331 if (III->second.Needs & StateStrictWWM)
1332 Needs = StateStrictWWM;
1333 else if (III->second.Needs & StateStrictWQM)
1334 Needs = StateStrictWQM;
1335 else if (III->second.Needs & StateWQM)
1336 Needs = StateWQM;
1337 else
1338 Needs &= ~III->second.Disabled;
1339 OutNeeds = III->second.OutNeeds;
1340 }
1341 } else {
1342 // If the instruction doesn't actually need a correct EXEC, then we can
1343 // safely leave Strict mode enabled.
1344 Needs = StateExact | StateWQM | StateStrict;
1345 }
1346
1347 // Exact mode exit can occur in terminators, but must be before branches.
1348 if (MI.isBranch() && OutNeeds == StateExact)
1349 Needs = StateExact;
1350
1351 ++Next;
1352 } else {
1353 // End of basic block
1354 if (BI.OutNeeds & StateWQM)
1355 Needs = StateWQM;
1356 else if (BI.OutNeeds == StateExact)
1357 Needs = StateExact;
1358 else
1359 Needs = StateWQM | StateExact;
1360 }
1361
1362 // Now, transition if necessary.
1363 if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
1366 State == StateStrictWQM || Needs == StateStrictWQM) {
1367 // We must switch to or from Strict mode.
1368 First = FirstStrict;
1369 } else {
1370 // We only need to switch to/from WQM, so we can use FirstWQM.
1371 First = FirstWQM;
1372 }
1373
1374 // Whether we need to save SCC depends on start and end states.
1375 bool SaveSCC = false;
1376 switch (State) {
1377 case StateExact:
1378 case StateStrictWWM:
1379 case StateStrictWQM:
1380 // Exact/Strict -> Strict: save SCC
1381 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1382 // Exact/Strict -> Exact: no save
1383 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1384 break;
1385 case StateWQM:
1386 // WQM -> Exact/Strict: save SCC
1387 SaveSCC = !(Needs & StateWQM);
1388 break;
1389 default:
1390 llvm_unreachable("Unknown state");
1391 break;
1392 }
1393 char StartState = State & StateStrict ? NonStrictState : State;
1394 bool WQMToExact =
1395 StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1396 bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1397 !(Needs & StateExact);
1398 bool PreferLast = Needs == StateWQM;
1399 // Exact regions in divergent control flow may run at EXEC=0, so try to
1400 // exclude instructions with unexpected effects from them.
1401 // FIXME: ideally we would branch over these when EXEC=0,
1402 // but this requires updating implicit values, live intervals and CFG.
1403 if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1404 for (MachineBasicBlock::iterator I = First; I != II; ++I) {
1405 if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
1406 PreferLast = WQMToExact;
1407 break;
1408 }
1409 }
1410 }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
1413
1414 if (State & StateStrict) {
1415 assert(State == StateStrictWWM || State == StateStrictWQM);
1416 assert(SavedNonStrictReg);
1417 fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1418
1419 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1420 SavedNonStrictReg = 0;
1421 State = NonStrictState;
1422 }
1423
1424 if (Needs & StateStrict) {
1425 NonStrictState = State;
1426 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1427 assert(!SavedNonStrictReg);
1428 SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1429
1430 toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1431 State = Needs;
1432 } else {
1433 if (WQMToExact) {
1434 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1435 assert(!SavedWQMReg);
1436 SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1437 }
1438
1439 toExact(MBB, Before, SavedWQMReg);
1440 State = StateExact;
1441 } else if (ExactToWQM) {
1442 assert(WQMFromExec == (SavedWQMReg == 0));
1443
1444 toWQM(MBB, Before, SavedWQMReg);
1445
1446 if (SavedWQMReg) {
1447 LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1448 SavedWQMReg = 0;
1449 }
1450 State = StateWQM;
1451 } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs; in that
          // case there is nothing to do.
1455 assert(Needs & State);
1456 }
1457 }
1458 }
1459
1460 if (Needs != (StateExact | StateWQM | StateStrict)) {
1461 if (Needs != (StateExact | StateWQM))
1462 FirstWQM = IE;
1463 FirstStrict = IE;
1464 }
1465
1466 if (II == IE)
1467 break;
1468
1469 II = Next;
1470 }
1471 assert(!SavedWQMReg);
1472 assert(!SavedNonStrictReg);
1473}
1474
1475bool SIWholeQuadMode::lowerLiveMaskQueries() {
1476 for (MachineInstr *MI : LiveMaskQueries) {
1477 const DebugLoc &DL = MI->getDebugLoc();
1478 Register Dest = MI->getOperand(0).getReg();
1479
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1482 .addReg(LiveMaskReg);
1483
1484 LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1485 MI->eraseFromParent();
1486 }
1487 return !LiveMaskQueries.empty();
1488}
1489
1490bool SIWholeQuadMode::lowerCopyInstrs() {
1491 for (MachineInstr *MI : LowerToMovInstrs) {
1492 assert(MI->getNumExplicitOperands() == 2);
1493
1494 const Register Reg = MI->getOperand(0).getReg();
1495
1496 const TargetRegisterClass *regClass =
1497 TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1498 if (TRI->isVGPRClass(regClass)) {
1499 const unsigned MovOp = TII->getMovOpcode(regClass);
1500 MI->setDesc(TII->get(MovOp));
1501
1502 // Check that it already implicitly depends on exec (like all VALU movs
1503 // should do).
1504 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1505 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1506 }));
1507 } else {
1508 // Remove early-clobber and exec dependency from simple SGPR copies.
1509 // This allows some to be eliminated during/post RA.
1510 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1511 if (MI->getOperand(0).isEarlyClobber()) {
1512 LIS->removeInterval(Reg);
1513 MI->getOperand(0).setIsEarlyClobber(false);
1514 LIS->createAndComputeVirtRegInterval(Reg);
1515 }
1516 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1517 while (Index >= 0) {
1518 MI->removeOperand(Index);
1519 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1520 }
1521 MI->setDesc(TII->get(AMDGPU::COPY));
1522 LLVM_DEBUG(dbgs() << " -> " << *MI);
1523 }
1524 }
1525 for (MachineInstr *MI : LowerToCopyInstrs) {
1526 LLVM_DEBUG(dbgs() << "simplify: " << *MI);
1527
1528 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1529 assert(MI->getNumExplicitOperands() == 6);
1530
1531 LiveInterval *RecomputeLI = nullptr;
1532 if (MI->getOperand(4).isReg())
1533 RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
1534
1535 MI->removeOperand(5);
1536 MI->removeOperand(4);
1537 MI->removeOperand(3);
1538 MI->removeOperand(1);
1539
1540 if (RecomputeLI)
1541 LIS->shrinkToUses(RecomputeLI);
1542 } else {
1543 assert(MI->getNumExplicitOperands() == 2);
1544 }
1545
1546 unsigned CopyOp = MI->getOperand(1).isReg()
1547 ? (unsigned)AMDGPU::COPY
1548 : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1549 *MRI, MI->getOperand(0)));
1550 MI->setDesc(TII->get(CopyOp));
1551 LLVM_DEBUG(dbgs() << " -> " << *MI);
1552 }
1553 return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1554}
1555
1556bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1557 for (MachineInstr *MI : KillInstrs) {
1558 MachineBasicBlock *MBB = MI->getParent();
1559 MachineInstr *SplitPoint = nullptr;
1560 switch (MI->getOpcode()) {
1561 case AMDGPU::SI_DEMOTE_I1:
1562 case AMDGPU::SI_KILL_I1_TERMINATOR:
1563 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1564 break;
1565 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1566 SplitPoint = lowerKillF32(*MBB, *MI);
1567 break;
1568 }
1569 if (SplitPoint)
1570 splitBlock(MBB, SplitPoint);
1571 }
1572 return !KillInstrs.empty();
1573}
1574
1575void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1576 MachineBasicBlock *MBB = MI.getParent();
1577 bool IsWave32 = ST->isWave32();
1578
1579 if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1580 assert(MBB == &MBB->getParent()->front() &&
1581 "init whole wave not in entry block");
1582 Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
1583 MachineInstr *SaveExec =
1584 BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1585 TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
1586 : AMDGPU::S_OR_SAVEEXEC_B64),
1587 EntryExec)
1588 .addImm(-1);
1589
1590 // Replace all uses of MI's destination reg with EntryExec.
1591 MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
1592
1593 if (LIS) {
1594 LIS->RemoveMachineInstrFromMaps(MI);
1595 }
1596
1597 MI.eraseFromParent();
1598
1599 if (LIS) {
1600 LIS->InsertMachineInstrInMaps(*SaveExec);
1601 LIS->createAndComputeVirtRegInterval(EntryExec);
1602 }
1603 return;
1604 }
1605
1606 if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1607 // This should be before all vector instructions.
1608 MachineInstr *InitMI =
1609 BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1610 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1611 Exec)
1612 .addImm(MI.getOperand(0).getImm());
1613 if (LIS) {
1614 LIS->RemoveMachineInstrFromMaps(MI);
1615 LIS->InsertMachineInstrInMaps(*InitMI);
1616 }
1617 MI.eraseFromParent();
1618 return;
1619 }
1620
1621 // Extract the thread count from an SGPR input and set EXEC accordingly.
1622 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1623 //
1624 // S_BFE_U32 count, input, {shift, 7}
1625 // S_BFM_B64 exec, count, 0
1626 // S_CMP_EQ_U32 count, 64
1627 // S_CMOV_B64 exec, -1
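  // For example, count == 3 yields EXEC = 0b111 from the S_BFM, while
  // count == 64 (a full wave64) takes the S_CMP/S_CMOV path and sets EXEC to
  // all ones.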
1628 Register InputReg = MI.getOperand(0).getReg();
1629 MachineInstr *FirstMI = &*MBB->begin();
1630 if (InputReg.isVirtual()) {
1631 MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1632 assert(DefInstr && DefInstr->isCopy());
1633 if (DefInstr->getParent() == MBB) {
1634 if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in the current block, we also need to
1636 // move that instruction to the beginning of the block.
1637 DefInstr->removeFromParent();
1638 MBB->insert(FirstMI, DefInstr);
1639 if (LIS)
1640 LIS->handleMove(*DefInstr);
1641 } else {
1642 // If first instruction is definition then move pointer after it.
1643 FirstMI = &*std::next(FirstMI->getIterator());
1644 }
1645 }
1646 }
1647
1648 // Insert instruction sequence at block beginning (before vector operations).
1649 const DebugLoc DL = MI.getDebugLoc();
1650 const unsigned WavefrontSize = ST->getWavefrontSize();
1651 const unsigned Mask = (WavefrontSize << 1) - 1;
1652 Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1653 auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1654 .addReg(InputReg)
1655 .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1656 auto BfmMI =
1657 BuildMI(*MBB, FirstMI, DL,
1658 TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1659 .addReg(CountReg)
1660 .addImm(0);
1661 auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1662 .addReg(CountReg, RegState::Kill)
1663 .addImm(WavefrontSize);
1664 auto CmovMI =
1665 BuildMI(*MBB, FirstMI, DL,
1666 TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1667 Exec)
1668 .addImm(-1);
1669
1670 if (!LIS) {
1671 MI.eraseFromParent();
1672 return;
1673 }
1674
1675 LIS->RemoveMachineInstrFromMaps(MI);
1676 MI.eraseFromParent();
1677
1678 LIS->InsertMachineInstrInMaps(*BfeMI);
1679 LIS->InsertMachineInstrInMaps(*BfmMI);
1680 LIS->InsertMachineInstrInMaps(*CmpMI);
1681 LIS->InsertMachineInstrInMaps(*CmovMI);
1682
1683 LIS->removeInterval(InputReg);
1684 LIS->createAndComputeVirtRegInterval(InputReg);
1685 LIS->createAndComputeVirtRegInterval(CountReg);
1686}
1687
1688/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1689/// for instructions that depend on EXEC.
1691SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1692 MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1693
1694 for (MachineInstr *MI : InitExecInstrs) {
1695 // Try to handle undefined cases gracefully:
1696 // - multiple INIT_EXEC instructions
1697 // - INIT_EXEC instructions not in the entry block
1698 if (MI->getParent() == &Entry)
1699 InsertPt = std::next(MI->getIterator());
1700
1701 lowerInitExec(*MI);
1702 Changed = true;
1703 }
1704
1705 return InsertPt;
1706}
1707
1708bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1709 LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1710 << " ------------- \n");
1711 LLVM_DEBUG(MF.dump(););
1712
1713 Instructions.clear();
1714 Blocks.clear();
1715 LiveMaskQueries.clear();
1716 LowerToCopyInstrs.clear();
1717 LowerToMovInstrs.clear();
1718 KillInstrs.clear();
1719 InitExecInstrs.clear();
1720 SetInactiveInstrs.clear();
1721 StateTransition.clear();
1722
1723 ST = &MF.getSubtarget<GCNSubtarget>();
1724
1725 TII = ST->getInstrInfo();
1726 TRI = &TII->getRegisterInfo();
1727 MRI = &MF.getRegInfo();
1728 LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1729 auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1730 MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1731 auto *PDTWrapper =
1732 getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1733 PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1734
1735 if (ST->isWave32()) {
1736 AndOpc = AMDGPU::S_AND_B32;
1737 AndTermOpc = AMDGPU::S_AND_B32_term;
1738 AndN2Opc = AMDGPU::S_ANDN2_B32;
1739 XorOpc = AMDGPU::S_XOR_B32;
1740 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1741 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1742 WQMOpc = AMDGPU::S_WQM_B32;
1743 Exec = AMDGPU::EXEC_LO;
1744 } else {
1745 AndOpc = AMDGPU::S_AND_B64;
1746 AndTermOpc = AMDGPU::S_AND_B64_term;
1747 AndN2Opc = AMDGPU::S_ANDN2_B64;
1748 XorOpc = AMDGPU::S_XOR_B64;
1749 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1750 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1751 WQMOpc = AMDGPU::S_WQM_B64;
1752 Exec = AMDGPU::EXEC;
1753 }
1754
1755 const char GlobalFlags = analyzeFunction(MF);
1756 bool Changed = false;
1757
1758 LiveMaskReg = Exec;
1759
  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
1762
1763 // Store a copy of the original live mask when required
1764 const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1765 const bool HasWaveModes = GlobalFlags & ~StateExact;
1766 const bool HasKills = !KillInstrs.empty();
1767 const bool UsesWQM = GlobalFlags & StateWQM;
1768 if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1769 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1770 MachineInstr *MI =
1771 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1772 .addReg(Exec);
1773 LIS->InsertMachineInstrInMaps(*MI);
1774 Changed = true;
1775 }
1776
1777 // Check if V_SET_INACTIVE was touched by a strict state mode.
1778 // If so, promote to WWM; otherwise lower to COPY.
1779 for (MachineInstr *MI : SetInactiveInstrs) {
1780 if (LowerToCopyInstrs.contains(MI))
1781 continue;
1782 if (Instructions[MI].MarkedStates & StateStrict) {
1783 Instructions[MI].Needs |= StateStrictWWM;
1784 Instructions[MI].Disabled &= ~StateStrictWWM;
1785 Blocks[MI->getParent()].Needs |= StateStrictWWM;
1786 } else {
1787 LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
1788 LowerToCopyInstrs.insert(MI);
1789 }
1790 }
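  // Illustrative sketch (operand shapes simplified): an instruction such as
  //   %dst = V_SET_INACTIVE_B32 %active, %inactive
  // that was reached by a strict-mode region keeps its semantics and is
  // executed under StrictWWM, while one never marked by a strict state is
  // rewritten to
  //   %dst = COPY %active
  // since its inactive-lane value can never be observed.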
1791
1792 LLVM_DEBUG(printInfo());
1793
1794 Changed |= lowerLiveMaskQueries();
1795 Changed |= lowerCopyInstrs();
1796
1797 if (!HasWaveModes) {
1798 // No wave mode execution
1799 Changed |= lowerKillInstrs(false);
1800 } else if (GlobalFlags == StateWQM) {
1801 // Shader only needs WQM
1802 auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1803 .addReg(Exec);
1804 LIS->InsertMachineInstrInMaps(*MI);
1805 lowerKillInstrs(true);
1806 Changed = true;
1807 } else {
1808 // Mark entry for WQM if required.
1809 if (GlobalFlags & StateWQM)
1810 Blocks[&Entry].InNeeds |= StateWQM;
1811 // Wave mode switching requires full lowering pass.
1812 for (auto BII : Blocks)
1813 processBlock(*BII.first, BII.first == &Entry);
1814 // Lowering blocks causes block splitting so perform as a second pass.
1815 for (auto BII : Blocks)
1816 lowerBlock(*BII.first);
1817 Changed = true;
1818 }
1819
1820 // Compute live range for live mask
1821 if (LiveMaskReg != Exec)
1822 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1823
1824 // Physical registers like SCC aren't tracked by default anyway, so just
1825 // removing the ranges we computed is the simplest option for maintaining
1826 // the analysis results.
1827 LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1828
1829 // If we lowered any kills or INIT_EXEC instructions then EXEC changed; drop its cached liveness so it is recomputed on demand
1830 if (!KillInstrs.empty() || !InitExecInstrs.empty())
1831 LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1832
1833 return Changed;
1834}
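// A minimal usage sketch, not part of this file: the pass is normally created
// through the factory declared in AMDGPU.h and scheduled by the AMDGPU target
// before register allocation. The wrapper function below and its placement are
// assumptions for illustration only.
//
//   #include "AMDGPU.h"
//   #include "llvm/IR/LegacyPassManager.h"
//
//   static void scheduleWholeQuadMode(llvm::legacy::PassManagerBase &PM) {
//     // SIWholeQuadMode requires LiveIntervals and must run while inserting
//     // EXEC mode switches is still straightforward, i.e. pre-RA.
//     PM.add(llvm::createSIWholeQuadModePass());
//   }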