//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow; specifically, an inactive lane enabled by strict WQM/WWM will
/// always be enabled, irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "SIWholeQuadMode.h"
#include "AMDGPU.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

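// Execution states tracked for each instruction and basic block. StateStrict
// is a convenience mask covering both strict modes.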
enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  static const std::pair<char, const char *> Mapping[] = {
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;

      if (State)
        OS << '|';
    }
  }
  assert(State == 0);
  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;        // States required when executing this instruction.
  char Disabled = 0;     // States that must not be enabled for it.
  char OutNeeds = 0;     // States required after this instruction.
  char MarkedStates = 0; // All states ever requested, including disabled ones.
};

struct BlockInfo {
  char Needs = 0;             // States required somewhere in this block.
  char InNeeds = 0;           // States required on entry to this block.
  char OutNeeds = 0;          // States required on exit from this block.
  char InitialState = 0;      // State at the top of the block (see processBlock).
  bool NeedsLowering = false; // Block contains instructions to lower.
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode {
public:
  SIWholeQuadMode(MachineFunction &MF, LiveIntervals *LIS,
                  MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT),
        PDT(PDT), LMC(AMDGPU::LaneMaskConstants::get(*ST)) {}
  bool run(MachineFunction &MF);

private:
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;
  const AMDGPU::LaneMaskConstants &LMC;

  Register LiveMaskReg;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;
  SmallVector<MachineInstr *, 4> InitExecInstrs;
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR,
                VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag,
                std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist,
                        SmallVector<MachineInstr *> &ExeczSideEffectInstrs);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  void splitBlock(MachineInstr *TermMI);
  MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM);
  MachineInstr *lowerKillF32(MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
  void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);

  bool lowerLiveMaskQueries();
  bool lowerCopyInstrs();
  bool lowerKillInstrs(bool IsWQM);
  void lowerInitExec(MachineInstr &MI);
  MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
                                                  bool &Changed);
};

class SIWholeQuadModeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIWholeQuadModeLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }
};
} // end anonymous namespace

char SIWholeQuadModeLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
                    false, false)

char &llvm::SIWholeQuadModeID = SIWholeQuadModeLegacy::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadModeLegacy;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
      }
    }
  }
}
#endif

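/// Record that \p MI requires the states in \p Flag and queue it so that the
/// requirement is propagated to the instructions it depends on.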
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Capture all states requested in marking including disabled ones.
  II.MarkedStates |= Flag;

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.emplace_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               VirtRegOrUnit VRegOrUnit, unsigned SubReg,
                               char Flag, std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (VRegOrUnit.isVirtualReg()
                    ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg())
                    : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Visited.insert(Key).second) {
      // On first visit to a phi then start processing first predecessor
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      const auto *PI = MBB->pred_begin() + Idx;
      const auto *PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, add the phi to the stack
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (VRegOrUnit.isVirtualReg()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->all_defs()) {
          if (Op.getReg() != VRegOrUnit.asVirtualReg())
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any of the use
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of the use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}

void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (Value)
        markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
                 Worklist);
    }
  }
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.all_uses())
    markOperand(MI, Use, Flag, Worklist);
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(
    MachineFunction &MF, std::vector<WorkItem> &Worklist,
    SmallVector<MachineInstr *> &ExeczSideEffectInstrs) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        // Only generate implicit WQM if implicit derivatives are required.
        // This avoids inserting unintended WQM if a shader type without
        // implicit derivatives uses an image sampling instruction.
        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
          // Sampling instructions don't need to produce results for all
          // pixels in a quad, they just require all inputs of a quad to have
          // been computed for derivatives.
          markInstructionUses(MI, StateWQM, Worklist);
          GlobalFlags |= StateWQM;
        }
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.insert(&MI);
        SoftWQMInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and it
        // also needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual source blend export acts as an implicit strict-WQM; its
          // sources need to be shuffled in strict WQM, but the export itself
          // needs to run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.emplace_back(MBB);
          }
          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;
        }
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
        III.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
        // Disable strict states; StrictWQM will be added as required later.
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(4);
        if (Inactive.isReg()) {
          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
            LowerToCopyInstrs.insert(&MI);
          else
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
        }
        SetInactiveInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
        InitExecInstrs.push_back(&MI);
      } else if (WQMOutputs) {
        // The function is in machine SSA form, which means that physical
        // VGPRs correspond to shader inputs and outputs. Inputs are
        // only used, outputs are only defined.
        // FIXME: is this still valid?
        for (const MachineOperand &MO : MI.defs()) {
          Register Reg = MO.getReg();
          if (Reg.isPhysical() &&
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
            Flags = StateWQM;
            break;
          }
        }
      }

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) {
        for (auto &Op : MI.uses()) {
          if (!Op.isReg())
            continue;
          if (!TRI->isVectorRegister(*MRI, Op.getReg()))
            continue;

          ExeczSideEffectInstrs.push_back(&MI);
          break;
        }
      }

      if (Flags) {
        markInstruction(MI, Flags, Worklist);
        GlobalFlags |= Flags;
      }
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM
  // is ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

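// Propagate the requirements of one marked instruction: up to the containing
// block, backwards to its predecessor in the block, and into its source
// operands.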
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}

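// Propagate block-level needs across CFG edges: the block's last instruction
// inherits OutNeeds, predecessors must produce InNeeds, and successors must
// accept OutNeeds.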
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.emplace_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.emplace_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.emplace_back(Succ);
  }
}

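// Run scanInstructions, then iterate the worklist to a fixpoint over
// instruction and block markings.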
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  SmallVector<MachineInstr *> ExeczSideEffectInstrs;
  char GlobalFlags = scanInstructions(MF, Worklist, ExeczSideEffectInstrs);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);

    if (Worklist.empty()) {
      // Currently we let instructions that have side effects when EXEC is
      // zero run in WQM; this avoids unwanted side effects in Exact mode if
      // only helper lanes execute the parent block. At the same time, the
      // WQM property must be back-propagated along the data flow of their
      // sources to ensure the sources hold correct data for the helper lanes.
      for (auto *MI : ExeczSideEffectInstrs) {
        InstrInfo II = Instructions[MI];
        if (II.OutNeeds & StateWQM)
          markInstructionUses(*MI, StateWQM, Worklist);
      }
      // This backward propagation cannot expand the WQM region, so it only
      // needs to run once.
      ExeczSideEffectInstrs.clear();
    }
  }

  return GlobalFlags;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

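// Split the containing block after \p TermMI and convert \p TermMI to one of
// the *_term opcodes, so later passes treat the EXEC update as a terminator.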
void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
  MachineBasicBlock *BB = TermMI->getParent();
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  case AMDGPU::S_ANDN2_B32:
    NewOpcode = AMDGPU::S_ANDN2_B32_term;
    break;
  case AMDGPU::S_ANDN2_B64:
    NewOpcode = AMDGPU::S_ANDN2_B64_term;
    break;
  default:
    llvm_unreachable("Unexpected instruction");
  }

  // These terminators fallthrough to the next block, no need to add an
  // unconditional branch to the next block (SplitBB).
  TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->applyUpdates(DTUpdates);
    if (PDT)
      PDT->applyUpdates(DTUpdates);
  }
}

MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  MachineBasicBlock &MBB = *MI.getParent();

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  // VCC represents lanes killed.
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(LMC.VccReg, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(LMC.VccReg);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LMC.ExecReg)
          .addReg(LMC.ExecReg)
          .addReg(LMC.VccReg);

  assert(MBB.succ_size() == 1);

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);

  return ExecMaskMI;
}

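// Lower a boolean kill or demote: fold the condition into the live mask,
// signal early termination if no lanes survive, then rewrite EXEC (for
// demote, keeping whole quads that still contain live lanes).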
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
  assert(LiveMaskReg.isVirtual());

  MachineBasicBlock &MBB = *MI.getParent();

  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(LMC.ExecReg);
    } else {
      // Static: kill does nothing
      bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
      if (!IsLastTerminator) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
        MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                                    .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return nullptr;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), TmpReg)
                                .addReg(LMC.ExecReg)
                                .add(Op);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.WQMOpc), LiveMaskWQM)
                    .addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg)
                  .addReg(LMC.ExecReg)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg)
                    .addReg(LMC.ExecReg)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? LMC.AndN2Opc : LMC.AndOpc;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(Opcode), LMC.ExecReg)
                    .addReg(LMC.ExecReg)
                    .add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  Register ActiveLanesReg = 0;
  char State = BI.InitialState;

  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    auto MIState = StateTransition.find(&MI);
    if (MIState != StateTransition.end())
      State = MIState->second;

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MI);
      break;
    case AMDGPU::ENTER_STRICT_WWM:
      ActiveLanesReg = MI.getOperand(0).getReg();
      break;
    case AMDGPU::EXIT_STRICT_WWM:
      ActiveLanesReg = 0;
      break;
    case AMDGPU::V_SET_INACTIVE_B32:
      if (ActiveLanesReg) {
        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
        MI.getOperand(5).setReg(ActiveLanesReg);
        LIS->shrinkToUses(&LI);
      } else {
        assert(State == StateExact || State == StateWQM);
      }
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);
  }

  // Perform splitting after instruction scan to simplify iteration.
  for (MachineInstr *MI : SplitPoints)
    splitBlock(MI);
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  // Skip debug instructions when getting slot indices, as they don't have
  // entries in the slot index map.
  auto FirstNonDbg = skipDebugInstructionsForward(First, MBBE);
  auto LastNonDbg = skipDebugInstructionsForward(Last, MBBE);
  SlotIndex FirstIdx = FirstNonDbg != MBBE
                           ? LIS->getInstructionIndex(*FirstNonDbg)
                           : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx = LastNonDbg != MBBE ? LIS->getInstructionIndex(*LastNonDbg)
                                         : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = next_nodbg(EndMI->getIterator(), MBB.instr_end());
      if (NextI == MBB.instr_end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->all_defs()) {
      IsExecDef |=
          MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

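// Switch the wave to Exact mode at Before, saving the current EXEC in
// SaveWQM when it must be restored later; uses *_term opcodes when the
// insertion point lies among the block's terminators.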
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  assert(LiveMaskReg.isVirtual());

  bool IsTerminator = Before == MBB.end();
  if (!IsTerminator) {
    auto FirstTerm = MBB.getFirstTerminator();
    if (FirstTerm != MBB.end()) {
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
      IsTerminator = BeforeIdx > FirstTermIdx;
    }
  }

  const DebugLoc &DL = MBB.findDebugLoc(Before);
  MachineInstr *MI;

  if (SaveWQM) {
    unsigned Opcode =
        IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc;
    MI =
        BuildMI(MBB, Before, DL, TII->get(Opcode), SaveWQM).addReg(LiveMaskReg);
  } else {
    unsigned Opcode = IsTerminator ? LMC.AndTermOpc : LMC.AndOpc;
    MI = BuildMI(MBB, Before, DL, TII->get(Opcode), LMC.ExecReg)
             .addReg(LMC.ExecReg)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  const DebugLoc &DL = MBB.findDebugLoc(Before);
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::COPY), LMC.ExecReg)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DL, TII->get(LMC.WQMOpc), LMC.ExecReg)
             .addReg(LMC.ExecReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  const DebugLoc &DL = MBB.findDebugLoc(Before);

  if (StrictStateNeeded == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WWM), SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WQM), SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StrictStateNeeded;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;

  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  const DebugLoc &DL = MBB.findDebugLoc(Before);

  if (CurrentStrictState == StateStrictWWM) {
    MI =
        BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WWM), LMC.ExecReg)
            .addReg(SavedOrig);
  } else {
    MI =
        BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WQM), LMC.ExecReg)
            .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}

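// Walk the block forwards and insert the state transitions computed by the
// analysis, deferring each switch to the latest point where every
// intervening instruction still tolerates it, and saving/restoring EXEC and
// SCC as required.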
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
                                   bool IsEntry) {
  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
        II->getOperand(1).getReg() == LMC.ExecReg)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record the initial state in the block information.
  BI.InitialState = State;

  for (unsigned Idx = 0;; ++Idx) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // Adjust needs if this is the first instruction of a WQM-requiring shader.
    if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
      Needs = StateWQM;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      char StartState = State & StateStrict ? NonStrictState : State;
      bool WQMToExact =
          StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
      bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
                        !(Needs & StateExact);
      bool PreferLast = Needs == StateWQM;
      // Exact regions in divergent control flow may run at EXEC=0, so try to
      // exclude instructions with unexpected effects from them.
      // FIXME: ideally we would branch over these when EXEC=0,
      // but this requires updating implicit values, live intervals and CFG.
      if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
        for (MachineBasicBlock::iterator I = First; I != II; ++I) {
          if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
            PreferLast = WQMToExact;
            break;
          }
        }
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;
      } else {
        if (WQMToExact) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (ExactToWQM) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}

bool SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
  return !LiveMaskQueries.empty();
}

bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    LLVM_DEBUG(dbgs() << "simplify: " << *MI);

    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
      assert(MI->getNumExplicitOperands() == 6);

      LiveInterval *RecomputeLI = nullptr;
      if (MI->getOperand(4).isReg())
        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());

      MI->removeOperand(5);
      MI->removeOperand(4);
      MI->removeOperand(3);
      MI->removeOperand(1);

      if (RecomputeLI)
        LIS->shrinkToUses(RecomputeLI);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
    LLVM_DEBUG(dbgs() << "  -> " << *MI);
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}

bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MI);
      break;
    }
    if (SplitPoint)
      splitBlock(SplitPoint);
  }
  return !KillInstrs.empty();
}

void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();

  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
    assert(MBB == &MBB->getParent()->front() &&
           "init whole wave not in entry block");
    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *SaveExec = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                                     TII->get(LMC.OrSaveExecOpc), EntryExec)
                                 .addImm(-1);

    // Replace all uses of MI's destination reg with EntryExec.
    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
    }

    MI.eraseFromParent();

    if (LIS) {
      LIS->InsertMachineInstrInMaps(*SaveExec);
      LIS->createAndComputeVirtRegInterval(EntryExec);
    }
    return;
  }

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                                   TII->get(LMC.MovOpc), LMC.ExecReg)
                               .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If first instruction is definition then move pointer after it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI = BuildMI(*MBB, FirstMI, DL, TII->get(LMC.BfmOpc), LMC.ExecReg)
                   .addReg(CountReg)
                   .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL, TII->get(LMC.CMovOpc), LMC.ExecReg).addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
/// for instructions that depend on EXEC.
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // Try to handle undefined cases gracefully:
    // - multiple INIT_EXEC instructions
    // - INIT_EXEC instructions not in the entry block
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
    Changed = true;
  }

  return InsertPt;
}

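// Pass body: analyze execution-state requirements, lower the special pseudo
// instructions, then insert the per-block state transitions.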
1729bool SIWholeQuadMode::run(MachineFunction &MF) {
1730 LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1731 << " ------------- \n");
1732 LLVM_DEBUG(MF.dump(););
1733
1734 Instructions.clear();
1735 Blocks.clear();
1736 LiveMaskQueries.clear();
1737 LowerToCopyInstrs.clear();
1738 LowerToMovInstrs.clear();
1739 KillInstrs.clear();
1740 InitExecInstrs.clear();
1741 SetInactiveInstrs.clear();
1742 StateTransition.clear();
1743
1744 const char GlobalFlags = analyzeFunction(MF);
1745 bool Changed = false;
1746
1747 LiveMaskReg = LMC.ExecReg;
1748
1749 MachineBasicBlock &Entry = MF.front();
1750 MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
1751
1752 // Store a copy of the original live mask when required
1753 const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1754 const bool HasWaveModes = GlobalFlags & ~StateExact;
1755 const bool HasKills = !KillInstrs.empty();
1756 const bool UsesWQM = GlobalFlags & StateWQM;
1757 if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1758 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1759 MachineInstr *MI =
1760 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1761 .addReg(LMC.ExecReg);
1762 LIS->InsertMachineInstrInMaps(*MI);
1763 Changed = true;
1764 }
1765
1766 // Check if V_SET_INACTIVE was touched by a strict state mode.
1767 // If so, promote to WWM; otherwise lower to COPY.
1768 for (MachineInstr *MI : SetInactiveInstrs) {
1769 if (LowerToCopyInstrs.contains(MI))
1770 continue;
1771 auto &Info = Instructions[MI];
1772 if (Info.MarkedStates & StateStrict) {
1773 Info.Needs |= StateStrictWWM;
1774 Info.Disabled &= ~StateStrictWWM;
1775 Blocks[MI->getParent()].Needs |= StateStrictWWM;
1776 } else {
1777 LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
1778 LowerToCopyInstrs.insert(MI);
1779 }
1780 }
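// [Editor's sketch, hypothetical MIR] When no strict mode ever covers it,
// V_SET_INACTIVE's inactive-lane input is irrelevant, so
//
//   %dst:vgpr_32 = V_SET_INACTIVE_B32 %active, %inactive
//
// degenerates to
//
//   %dst:vgpr_32 = COPY %active
//
// which is what adding it to LowerToCopyInstrs arranges.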
1781
1782 LLVM_DEBUG(printInfo());
1783
1784 Changed |= lowerLiveMaskQueries();
1785 Changed |= lowerCopyInstrs();
1786
1787 if (!HasWaveModes) {
1788 // No wave mode execution
1789 Changed |= lowerKillInstrs(false);
1790 } else if (GlobalFlags == StateWQM) {
1791 // Shader only needs WQM
1792 auto MI =
1793 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg)
1794 .addReg(LMC.ExecReg);
1795 LIS->InsertMachineInstrInMaps(*MI);
1796 lowerKillInstrs(true);
1797 Changed = true;
1798 } else {
1799 // Mark entry for WQM if required.
1800 if (GlobalFlags & StateWQM)
1801 Blocks[&Entry].InNeeds |= StateWQM;
1802 // Wave mode switching requires full lowering pass.
1803 for (auto &BII : Blocks)
1804 processBlock(*BII.first, BII.second, BII.first == &Entry);
1805 // Lowering blocks can cause block splitting, so perform it as a second pass.
1806 for (auto &BII : Blocks)
1807 lowerBlock(*BII.first, BII.second);
1808 Changed = true;
1809 }
1810
1811 // Compute live range for live mask
1812 if (LiveMaskReg != LMC.ExecReg)
1813 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1814
1815 // Physical registers like SCC aren't tracked by default anyway, so just
1816 // removing the ranges we computed is the simplest option for maintaining
1817 // the analysis results.
1818 LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1819
1820 // If we lowered any kills or INIT_EXEC instructions then recompute EXEC
1821 if (!KillInstrs.empty() || !InitExecInstrs.empty())
1822 LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1823
1824 return Changed;
1825}
1826
1827bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
1828 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1829 auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1830 MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1831 auto *PDTWrapper =
1832 getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1833 MachinePostDominatorTree *PDT =
1834 PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1835 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1836 return Impl.run(MF);
1837}
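// [Editor's note, pipeline sketch] Under the legacy pass manager the pass is
// constructed via the factory declared in AMDGPU.h and scheduled like any
// other machine function pass:
//
//   PM.add(createSIWholeQuadModeLegacyPass());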
1838
1839PreservedAnalyses
1840SIWholeQuadModePass::run(MachineFunction &MF,
1841 MachineFunctionAnalysisManager &MFAM) {
1842 MFPropsModifier _(*this, MF);
1843
1844 LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
1845 MachineDominatorTree *MDT =
1846 MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
1847 MachinePostDominatorTree *PDT =
1848 MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
1849 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1850 bool Changed = Impl.run(MF);
1851 if (!Changed)
1852 return PreservedAnalyses::all();
1853
1854 auto PA = getMachineFunctionPassPreservedAnalyses();
1855 PA.preserve<SlotIndexesAnalysis>();
1856 PA.preserve<LiveIntervalsAnalysis>();
1857 PA.preserve<MachineDominatorTreeAnalysis>();
1858 PA.preserve<MachinePostDominatorTreeAnalysis>();
1859 return PA;
1860}
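// [Editor's note, invocation sketch; exact flags may vary by tree] The pass
// can be exercised in isolation on MIR input, e.g.
//
//   llc -mtriple=amdgcn -run-pass=si-wqm -o - input.mir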