SIWholeQuadMode.cpp
1//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass adds instructions to enable whole quad mode (strict or non-strict)
11/// for pixel shaders, and strict whole wavefront mode for all programs.
12///
13/// The "strict" prefix indicates that inactive lanes do not take part in
14/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15/// always be enabled irrespective of control flow decisions. Conversely in
16/// non-strict WQM inactive lanes may control flow decisions.
17///
18/// Whole quad mode is required for derivative computations, but it interferes
19/// with shader side effects (stores and atomics). This pass ensures that WQM
20/// is enabled when necessary, but disabled around stores and atomics.
21///
22/// When necessary, this pass creates a function prolog
23///
24/// S_MOV_B64 LiveMask, EXEC
25/// S_WQM_B64 EXEC, EXEC
26///
27/// to enter WQM at the top of the function and surrounds blocks of Exact
28/// instructions by
29///
30/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
31/// ...
32/// S_MOV_B64 EXEC, Tmp
33///
34/// We also compute when a sequence of instructions requires strict whole
35/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36///
37/// S_OR_SAVEEXEC_B64 Tmp, -1
38/// ...
39/// S_MOV_B64 EXEC, Tmp
40///
41/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42/// we use a similar save and restore mechanism and force whole quad mode for
43/// those instructions:
44///
45/// S_MOV_B64 Tmp, EXEC
46/// S_WQM_B64 EXEC, EXEC
47/// ...
48/// S_MOV_B64 EXEC, Tmp
49///
50/// In order to avoid excessive switching during sequences of Exact
51/// instructions, the pass first analyzes which instructions must be run in WQM
52/// (aka which instructions produce values that lead to derivative
53/// computations).
54///
55/// Basic blocks are always exited in WQM as long as some successor needs WQM.
56///
57/// There is room for improvement given better control flow analysis:
58///
59/// (1) at the top level (outside of control flow statements, and as long as
60/// kill hasn't been used), one SGPR can be saved by recovering WQM from
61/// the LiveMask (this is implemented for the entry block).
62///
63/// (2) when entire regions (e.g. if-else blocks or entire loops) only
64/// consist of exact and don't-care instructions, the switch only has to
65/// be done at the entry and exit points rather than potentially in each
66/// block of the region.
67///
68//===----------------------------------------------------------------------===//
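//
// Illustrative sketch (not taken from this file), combining the prolog and
// Exact-block sequences described above for a pixel shader whose sampled
// result feeds a buffer store:
//
//   S_MOV_B64   LiveMask, EXEC        ; prolog: remember the truly live lanes
//   S_WQM_B64   EXEC, EXEC            ; enter WQM for derivative-using code
//   ...                               ; WQM instructions (image_sample etc.)
//   S_AND_SAVEEXEC_B64 Tmp, LiveMask  ; drop helper lanes around side effects
//   buffer_store_dword ...            ; Exact instruction
//   S_MOV_B64   EXEC, Tmp             ; return to WQM
//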
69
70#include "AMDGPU.h"
71#include "GCNSubtarget.h"
73#include "llvm/ADT/MapVector.h"
81#include "llvm/IR/CallingConv.h"
84
85using namespace llvm;
86
87#define DEBUG_TYPE "si-wqm"
88
89namespace {
90
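// Execution-mask states used throughout the pass; instruction and block
// flags below are bitwise ORs of these values.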
91enum {
92 StateWQM = 0x1,
93 StateStrictWWM = 0x2,
94 StateStrictWQM = 0x4,
95 StateExact = 0x8,
96 StateStrict = StateStrictWWM | StateStrictWQM,
97};
98
99struct PrintState {
100public:
101 int State;
102
103 explicit PrintState(int State) : State(State) {}
104};
105
106#ifndef NDEBUG
107static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108
109 static const std::pair<char, const char *> Mapping[] = {
110 std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111 std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 char State = PS.State;
113 for (auto M : Mapping) {
114 if (State & M.first) {
115 OS << M.second;
116 State &= ~M.first;
117
118 if (State)
119 OS << '|';
120 }
121 }
122 assert(State == 0);
123 return OS;
124}
125#endif
126
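// Per-instruction analysis result: Needs are states the instruction requires,
// Disabled are states it must never run in, and OutNeeds are states required
// after it (propagated backwards from later instructions).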
127struct InstrInfo {
128 char Needs = 0;
129 char Disabled = 0;
130 char OutNeeds = 0;
131};
132
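// Per-block analysis result: InNeeds/OutNeeds are the states required at block
// entry/exit, InitialState records the execution state at the block start
// (set by processBlock, used by lowerBlock), and NeedsLowering marks blocks
// containing kill/demote pseudos that still need rewriting.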
133struct BlockInfo {
134 char Needs = 0;
135 char InNeeds = 0;
136 char OutNeeds = 0;
137 char InitialState = 0;
138 bool NeedsLowering = false;
139};
140
141struct WorkItem {
142 MachineBasicBlock *MBB = nullptr;
143 MachineInstr *MI = nullptr;
144
145 WorkItem() = default;
146 WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147 WorkItem(MachineInstr *MI) : MI(MI) {}
148};
149
150class SIWholeQuadMode : public MachineFunctionPass {
151private:
152 const SIInstrInfo *TII;
153 const SIRegisterInfo *TRI;
154 const GCNSubtarget *ST;
155 MachineRegisterInfo *MRI;
156 LiveIntervals *LIS;
157 MachineDominatorTree *MDT;
158 MachinePostDominatorTree *PDT;
159
160 unsigned AndOpc;
161 unsigned AndTermOpc;
162 unsigned AndN2Opc;
163 unsigned XorOpc;
164 unsigned AndSaveExecOpc;
165 unsigned AndSaveExecTermOpc;
166 unsigned WQMOpc;
167 Register Exec;
168 Register LiveMaskReg;
169
170 DenseMap<const MachineInstr *, InstrInfo> Instructions;
171 MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172
173 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174 DenseMap<const MachineInstr *, char> StateTransition;
175
176 SmallVector<MachineInstr *, 2> LiveMaskQueries;
177 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178 SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179 SmallVector<MachineInstr *, 4> KillInstrs;
180 SmallVector<MachineInstr *, 4> InitExecInstrs;
181
182 void printInfo();
183
184 void markInstruction(MachineInstr &MI, char Flag,
185 std::vector<WorkItem> &Worklist);
186 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189 std::vector<WorkItem> &Worklist);
190 void markInstructionUses(const MachineInstr &MI, char Flag,
191 std::vector<WorkItem> &Worklist);
192 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
193 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
194 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
195 char analyzeFunction(MachineFunction &MF);
196
197 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
198 MachineBasicBlock::iterator Before);
199 MachineBasicBlock::iterator
200 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
201 MachineBasicBlock::iterator Last, bool PreferLast,
202 bool SaveSCC);
203 void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204 Register SaveWQM);
205 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206 Register SavedWQM);
207 void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208 Register SaveOrig, char StrictStateNeeded);
209 void fromStrictMode(MachineBasicBlock &MBB,
210 MachineBasicBlock::iterator Before, Register SavedOrig,
211 char NonStrictState, char CurrentStrictState);
212
213 MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
214
215 MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216 bool IsWQM);
217 MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
218
219 void lowerBlock(MachineBasicBlock &MBB);
220 void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221
222 void lowerLiveMaskQueries();
223 void lowerCopyInstrs();
224 void lowerKillInstrs(bool IsWQM);
225 void lowerInitExec(MachineInstr &MI);
226 MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
227
228public:
229 static char ID;
230
231 SIWholeQuadMode() :
232 MachineFunctionPass(ID) { }
233
234 bool runOnMachineFunction(MachineFunction &MF) override;
235
236 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
237
238 void getAnalysisUsage(AnalysisUsage &AU) const override {
239 AU.addRequired<LiveIntervals>();
240 AU.addPreserved<SlotIndexes>();
241 AU.addPreserved<LiveIntervals>();
242 AU.addPreserved<MachineDominatorTreeWrapperPass>();
243 AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
244 MachineFunctionPass::getAnalysisUsage(AU);
245 }
246
247 MachineFunctionProperties getClearedProperties() const override {
248 return MachineFunctionProperties().set(
249 MachineFunctionProperties::Property::IsSSA);
250 }
251};
252
253} // end anonymous namespace
254
255char SIWholeQuadMode::ID = 0;
256
257INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
258 false)
259INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
260INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
261INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
262INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
263 false)
264
265char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
266
267FunctionPass *llvm::createSIWholeQuadModePass() {
268 return new SIWholeQuadMode;
269}
270
271#ifndef NDEBUG
272LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273 for (const auto &BII : Blocks) {
274 dbgs() << "\n"
275 << printMBBReference(*BII.first) << ":\n"
276 << " InNeeds = " << PrintState(BII.second.InNeeds)
277 << ", Needs = " << PrintState(BII.second.Needs)
278 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
279
280 for (const MachineInstr &MI : *BII.first) {
281 auto III = Instructions.find(&MI);
282 if (III != Instructions.end()) {
283 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
284 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
285 }
286 }
287 }
288}
289#endif
290
291void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
292 std::vector<WorkItem> &Worklist) {
293 InstrInfo &II = Instructions[&MI];
294
295 assert(!(Flag & StateExact) && Flag != 0);
296
297 // Remove any disabled states from the flag. The user that required it gets
298 // an undefined value in the helper lanes. For example, this can happen if
299 // the result of an atomic is used by an instruction that requires WQM, where
300 // ignoring the request for WQM is correct as per the relevant specs.
301 Flag &= ~II.Disabled;
302
303 // Ignore if the flag is already encompassed by the existing needs, or we
304 // just disabled everything.
305 if ((II.Needs & Flag) == Flag)
306 return;
307
308 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
309 II.Needs |= Flag;
310 Worklist.push_back(&MI);
311}
312
313/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
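/// Walks the value's def chain backwards (including through PHI predecessors)
/// until all lanes read by \p UseMI are covered, marking each contributing
/// defining instruction with \p Flag.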
314void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
315 Register Reg, unsigned SubReg, char Flag,
316 std::vector<WorkItem> &Worklist) {
317 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
318
319 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
320 const VNInfo *Value = UseLRQ.valueIn();
321 if (!Value)
322 return;
323
324 // Note: this code assumes that lane masks on AMDGPU completely
325 // cover registers.
326 const LaneBitmask UseLanes =
327 SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
328 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
329 : LaneBitmask::getNone());
330
331 // Perform a depth-first iteration of the LiveRange graph marking defs.
332 // Stop processing of a given branch when all use lanes have been defined.
333 // The first definition stops processing for a physical register.
334 struct PhiEntry {
335 const VNInfo *Phi;
336 unsigned PredIdx;
337 LaneBitmask DefinedLanes;
338
339 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
340 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
341 };
342 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
343 SmallVector<PhiEntry, 2> PhiStack;
344 SmallSet<VisitKey, 4> Visited;
345 LaneBitmask DefinedLanes;
346 unsigned NextPredIdx = 0; // Only used for processing phi nodes
347 do {
348 const VNInfo *NextValue = nullptr;
349 const VisitKey Key(Value, DefinedLanes);
350
351 if (Visited.insert(Key).second) {
352 // On the first visit to a phi, start processing at the first predecessor
353 NextPredIdx = 0;
354 }
355
356 if (Value->isPHIDef()) {
357 // Each predecessor node in the phi must be processed as a subgraph
358 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
359 assert(MBB && "Phi-def has no defining MBB");
360
361 // Find next predecessor to process
362 unsigned Idx = NextPredIdx;
363 auto PI = MBB->pred_begin() + Idx;
364 auto PE = MBB->pred_end();
365 for (; PI != PE && !NextValue; ++PI, ++Idx) {
366 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
367 if (!Visited.count(VisitKey(VN, DefinedLanes)))
368 NextValue = VN;
369 }
370 }
371
372 // If there are more predecessors to process, add the phi to the stack
373 if (PI != PE)
374 PhiStack.emplace_back(Value, Idx, DefinedLanes);
375 } else {
376 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
377 assert(MI && "Def has no defining instruction");
378
379 if (Reg.isVirtual()) {
380 // Iterate over all operands to find relevant definitions
381 bool HasDef = false;
382 for (const MachineOperand &Op : MI->all_defs()) {
383 if (Op.getReg() != Reg)
384 continue;
385
386 // Compute lanes defined and overlap with use
387 LaneBitmask OpLanes =
388 Op.isUndef() ? LaneBitmask::getAll()
389 : TRI->getSubRegIndexLaneMask(Op.getSubReg());
390 LaneBitmask Overlap = (UseLanes & OpLanes);
391
392 // Record whether this instruction defined any lanes of the use
393 HasDef |= Overlap.any();
394
395 // Mark any lanes defined
396 DefinedLanes |= OpLanes;
397 }
398
399 // Check if all lanes of use have been defined
400 if ((DefinedLanes & UseLanes) != UseLanes) {
401 // Definition not complete; need to process input value
402 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
403 if (const VNInfo *VN = LRQ.valueIn()) {
404 if (!Visited.count(VisitKey(VN, DefinedLanes)))
405 NextValue = VN;
406 }
407 }
408
409 // Only mark the instruction if it defines some part of the use
410 if (HasDef)
411 markInstruction(*MI, Flag, Worklist);
412 } else {
413 // For physical registers simply mark the defining instruction
414 markInstruction(*MI, Flag, Worklist);
415 }
416 }
417
418 if (!NextValue && !PhiStack.empty()) {
419 // Reach end of chain; revert to processing last phi
420 PhiEntry &Entry = PhiStack.back();
421 NextValue = Entry.Phi;
422 NextPredIdx = Entry.PredIdx;
423 DefinedLanes = Entry.DefinedLanes;
424 PhiStack.pop_back();
425 }
426
427 Value = NextValue;
428 } while (Value);
429}
430
431void SIWholeQuadMode::markOperand(const MachineInstr &MI,
432 const MachineOperand &Op, char Flag,
433 std::vector<WorkItem> &Worklist) {
434 assert(Op.isReg());
435 Register Reg = Op.getReg();
436
437 // Ignore some hardware registers
438 switch (Reg) {
439 case AMDGPU::EXEC:
440 case AMDGPU::EXEC_LO:
441 return;
442 default:
443 break;
444 }
445
446 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
447 << " for " << MI);
448 if (Reg.isVirtual()) {
449 LiveRange &LR = LIS->getInterval(Reg);
450 markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
451 } else {
452 // Handle physical registers that we need to track; this is mostly relevant
453 // for VCC, which can appear as the (implicit) input of a uniform branch,
454 // e.g. when a loop counter is stored in a VGPR.
455 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
456 LiveRange &LR = LIS->getRegUnit(Unit);
457 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
458 if (Value)
459 markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
460 }
461 }
462}
463
464/// Mark all instructions defining the uses in \p MI with \p Flag.
465void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
466 std::vector<WorkItem> &Worklist) {
467 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
468 << MI);
469
470 for (const MachineOperand &Use : MI.all_uses())
471 markOperand(MI, Use, Flag, Worklist);
472}
473
474// Scan instructions to determine which ones require an Exact execmask and
475// which ones seed WQM requirements.
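// Also collects the pseudo instructions (live-mask queries, WQM/WWM copies,
// kill/demote pseudos, INIT_EXEC) into worklists for the later lowering steps.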
476char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
477 std::vector<WorkItem> &Worklist) {
478 char GlobalFlags = 0;
479 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
480 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
481 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
482 bool HasImplicitDerivatives =
483 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
484
485 // We need to visit the basic blocks in reverse post-order so that we visit
486 // defs before uses, in particular so that we don't accidentally mark an
487 // instruction as needing e.g. WQM before visiting it and realizing it needs
488 // WQM disabled.
489 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
490 for (MachineBasicBlock *MBB : RPOT) {
491 BlockInfo &BBI = Blocks[MBB];
492
493 for (MachineInstr &MI : *MBB) {
494 InstrInfo &III = Instructions[&MI];
495 unsigned Opcode = MI.getOpcode();
496 char Flags = 0;
497
498 if (TII->isWQM(Opcode)) {
499 // If LOD is not supported WQM is not needed.
500 // Only generate implicit WQM if implicit derivatives are required.
501 // This avoids inserting unintended WQM if a shader type without
502 // implicit derivatives uses an image sampling instruction.
503 if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
504 // Sampling instructions don't need to produce results for all pixels
505 // in a quad, they just require all inputs of a quad to have been
506 // computed for derivatives.
507 markInstructionUses(MI, StateWQM, Worklist);
508 GlobalFlags |= StateWQM;
509 }
510 } else if (Opcode == AMDGPU::WQM) {
511 // The WQM intrinsic requires its output to have all the helper lanes
512 // correct, so we need it to be in WQM.
513 Flags = StateWQM;
514 LowerToCopyInstrs.push_back(&MI);
515 } else if (Opcode == AMDGPU::SOFT_WQM) {
516 LowerToCopyInstrs.push_back(&MI);
517 SoftWQMInstrs.push_back(&MI);
518 } else if (Opcode == AMDGPU::STRICT_WWM) {
519 // The STRICT_WWM intrinsic doesn't make the same guarantee, and it also
520 // needs to be executed in WQM or Exact so that its copy doesn't
521 // clobber inactive lanes.
522 markInstructionUses(MI, StateStrictWWM, Worklist);
523 GlobalFlags |= StateStrictWWM;
524 LowerToMovInstrs.push_back(&MI);
525 } else if (Opcode == AMDGPU::STRICT_WQM ||
526 TII->isDualSourceBlendEXP(MI)) {
527 // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
528 // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
529 // quads that have at least one active thread.
530 markInstructionUses(MI, StateStrictWQM, Worklist);
531 GlobalFlags |= StateStrictWQM;
532
533 if (Opcode == AMDGPU::STRICT_WQM) {
534 LowerToMovInstrs.push_back(&MI);
535 } else {
536 // Dual source blend export acts as implicit strict-wqm, its sources
537 // need to be shuffled in strict wqm, but the export itself needs to
538 // run in exact mode.
539 BBI.Needs |= StateExact;
540 if (!(BBI.InNeeds & StateExact)) {
541 BBI.InNeeds |= StateExact;
542 Worklist.push_back(MBB);
543 }
544 GlobalFlags |= StateExact;
545 III.Disabled = StateWQM | StateStrict;
546 }
547 } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
548 Opcode == AMDGPU::DS_PARAM_LOAD ||
549 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
550 Opcode == AMDGPU::DS_DIRECT_LOAD) {
551 // Mark these StrictWQM, but only for the instruction, not its operands.
552 // This avoids unnecessarily marking M0 as requiring WQM.
553 InstrInfo &II = Instructions[&MI];
554 II.Needs |= StateStrictWQM;
555 GlobalFlags |= StateStrictWQM;
556 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
557 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
558 III.Disabled = StateStrict;
559 MachineOperand &Inactive = MI.getOperand(2);
560 if (Inactive.isReg()) {
561 if (Inactive.isUndef()) {
562 LowerToCopyInstrs.push_back(&MI);
563 } else {
564 markOperand(MI, Inactive, StateStrictWWM, Worklist);
565 }
566 }
567 SetInactiveInstrs.push_back(&MI);
568 } else if (TII->isDisableWQM(MI)) {
569 BBI.Needs |= StateExact;
570 if (!(BBI.InNeeds & StateExact)) {
571 BBI.InNeeds |= StateExact;
572 Worklist.push_back(MBB);
573 }
574 GlobalFlags |= StateExact;
575 III.Disabled = StateWQM | StateStrict;
576 } else if (Opcode == AMDGPU::SI_PS_LIVE ||
577 Opcode == AMDGPU::SI_LIVE_MASK) {
578 LiveMaskQueries.push_back(&MI);
579 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
580 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
581 Opcode == AMDGPU::SI_DEMOTE_I1) {
582 KillInstrs.push_back(&MI);
583 BBI.NeedsLowering = true;
584 } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
585 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
586 InitExecInstrs.push_back(&MI);
587 } else if (WQMOutputs) {
588 // The function is in machine SSA form, which means that physical
589 // VGPRs correspond to shader inputs and outputs. Inputs are
590 // only used, outputs are only defined.
591 // FIXME: is this still valid?
592 for (const MachineOperand &MO : MI.defs()) {
593 Register Reg = MO.getReg();
594 if (Reg.isPhysical() &&
595 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
596 Flags = StateWQM;
597 break;
598 }
599 }
600 }
601
602 if (Flags) {
603 markInstruction(MI, Flags, Worklist);
604 GlobalFlags |= Flags;
605 }
606 }
607 }
608
609 // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
610 // ever used anywhere in the function. This implements the corresponding
611 // semantics of @llvm.amdgcn.set.inactive.
612 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
613 if (GlobalFlags & StateWQM) {
614 for (MachineInstr *MI : SetInactiveInstrs)
615 markInstruction(*MI, StateWQM, Worklist);
616 for (MachineInstr *MI : SoftWQMInstrs)
617 markInstruction(*MI, StateWQM, Worklist);
618 }
619
620 return GlobalFlags;
621}
622
623void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
624 std::vector<WorkItem>& Worklist) {
625 MachineBasicBlock *MBB = MI.getParent();
626 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
627 BlockInfo &BI = Blocks[MBB];
628
629 // Control flow-type instructions and stores to temporary memory that are
630 // followed by WQM computations must themselves be in WQM.
631 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
632 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
633 Instructions[&MI].Needs = StateWQM;
634 II.Needs = StateWQM;
635 }
636
637 // Propagate to block level
638 if (II.Needs & StateWQM) {
639 BI.Needs |= StateWQM;
640 if (!(BI.InNeeds & StateWQM)) {
641 BI.InNeeds |= StateWQM;
642 Worklist.push_back(MBB);
643 }
644 }
645
646 // Propagate backwards within block
647 if (MachineInstr *PrevMI = MI.getPrevNode()) {
648 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
649 if (!PrevMI->isPHI()) {
650 InstrInfo &PrevII = Instructions[PrevMI];
651 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
652 PrevII.OutNeeds |= InNeeds;
653 Worklist.push_back(PrevMI);
654 }
655 }
656 }
657
658 // Propagate WQM flag to instruction inputs
659 assert(!(II.Needs & StateExact));
660
661 if (II.Needs != 0)
662 markInstructionUses(MI, II.Needs, Worklist);
663
664 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
665 // not require any WQM transitions.
666 if (II.Needs & StateStrictWWM)
667 BI.Needs |= StateStrictWWM;
668 if (II.Needs & StateStrictWQM)
669 BI.Needs |= StateStrictWQM;
670}
671
672void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
673 std::vector<WorkItem>& Worklist) {
674 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
675
676 // Propagate through instructions
677 if (!MBB.empty()) {
678 MachineInstr *LastMI = &*MBB.rbegin();
679 InstrInfo &LastII = Instructions[LastMI];
680 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
681 LastII.OutNeeds |= BI.OutNeeds;
682 Worklist.push_back(LastMI);
683 }
684 }
685
686 // Predecessor blocks must provide for our WQM/Exact needs.
687 for (MachineBasicBlock *Pred : MBB.predecessors()) {
688 BlockInfo &PredBI = Blocks[Pred];
689 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
690 continue;
691
692 PredBI.OutNeeds |= BI.InNeeds;
693 PredBI.InNeeds |= BI.InNeeds;
694 Worklist.push_back(Pred);
695 }
696
697 // All successors must be prepared to accept the same set of WQM/Exact data.
698 for (MachineBasicBlock *Succ : MBB.successors()) {
699 BlockInfo &SuccBI = Blocks[Succ];
700 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
701 continue;
702
703 SuccBI.InNeeds |= BI.OutNeeds;
704 Worklist.push_back(Succ);
705 }
706}
707
708char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
709 std::vector<WorkItem> Worklist;
710 char GlobalFlags = scanInstructions(MF, Worklist);
711
712 while (!Worklist.empty()) {
713 WorkItem WI = Worklist.back();
714 Worklist.pop_back();
715
716 if (WI.MI)
717 propagateInstruction(*WI.MI, Worklist);
718 else
719 propagateBlock(*WI.MBB, Worklist);
720 }
721
722 return GlobalFlags;
723}
724
725MachineBasicBlock::iterator
726SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
727 MachineBasicBlock::iterator Before) {
728 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
729
730 MachineInstr *Save =
731 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
732 .addReg(AMDGPU::SCC);
733 MachineInstr *Restore =
734 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
735 .addReg(SaveReg);
736
737 LIS->InsertMachineInstrInMaps(*Save);
738 LIS->InsertMachineInstrInMaps(*Restore);
739 LIS->createAndComputeVirtRegInterval(SaveReg);
740
741 return Restore;
742}
743
744MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
745 MachineInstr *TermMI) {
746 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
747 << *TermMI << "\n");
748
749 MachineBasicBlock *SplitBB =
750 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
751
752 // Convert last instruction in block to a terminator.
753 // Note: this only covers the expected patterns
754 unsigned NewOpcode = 0;
755 switch (TermMI->getOpcode()) {
756 case AMDGPU::S_AND_B32:
757 NewOpcode = AMDGPU::S_AND_B32_term;
758 break;
759 case AMDGPU::S_AND_B64:
760 NewOpcode = AMDGPU::S_AND_B64_term;
761 break;
762 case AMDGPU::S_MOV_B32:
763 NewOpcode = AMDGPU::S_MOV_B32_term;
764 break;
765 case AMDGPU::S_MOV_B64:
766 NewOpcode = AMDGPU::S_MOV_B64_term;
767 break;
768 default:
769 break;
770 }
771 if (NewOpcode)
772 TermMI->setDesc(TII->get(NewOpcode));
773
774 if (SplitBB != BB) {
775 // Update dominator trees
776 using DomTreeT = DomTreeBase<MachineBasicBlock>;
777 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
778 for (MachineBasicBlock *Succ : SplitBB->successors()) {
779 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
780 DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
781 }
782 DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
783 if (MDT)
784 MDT->getBase().applyUpdates(DTUpdates);
785 if (PDT)
786 PDT->applyUpdates(DTUpdates);
787
788 // Link blocks
789 MachineInstr *MI =
790 BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
791 .addMBB(SplitBB);
792 LIS->InsertMachineInstrInMaps(*MI);
793 }
794
795 return SplitBB;
796}
797
798MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
799 MachineInstr &MI) {
800 const DebugLoc &DL = MI.getDebugLoc();
801 unsigned Opcode = 0;
802
803 assert(MI.getOperand(0).isReg());
804
805 // Comparison is for live lanes; however here we compute the inverse
806 // (killed lanes). This is because VCMP will always generate 0 bits
807 // for inactive lanes so a mask of live lanes would not be correct
808 // inside control flow.
809 // Invert the comparison by swapping the operands and adjusting
810 // the comparison codes.
811
812 switch (MI.getOperand(2).getImm()) {
813 case ISD::SETUEQ:
814 Opcode = AMDGPU::V_CMP_LG_F32_e64;
815 break;
816 case ISD::SETUGT:
817 Opcode = AMDGPU::V_CMP_GE_F32_e64;
818 break;
819 case ISD::SETUGE:
820 Opcode = AMDGPU::V_CMP_GT_F32_e64;
821 break;
822 case ISD::SETULT:
823 Opcode = AMDGPU::V_CMP_LE_F32_e64;
824 break;
825 case ISD::SETULE:
826 Opcode = AMDGPU::V_CMP_LT_F32_e64;
827 break;
828 case ISD::SETUNE:
829 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
830 break;
831 case ISD::SETO:
832 Opcode = AMDGPU::V_CMP_O_F32_e64;
833 break;
834 case ISD::SETUO:
835 Opcode = AMDGPU::V_CMP_U_F32_e64;
836 break;
837 case ISD::SETOEQ:
838 case ISD::SETEQ:
839 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
840 break;
841 case ISD::SETOGT:
842 case ISD::SETGT:
843 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
844 break;
845 case ISD::SETOGE:
846 case ISD::SETGE:
847 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
848 break;
849 case ISD::SETOLT:
850 case ISD::SETLT:
851 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
852 break;
853 case ISD::SETOLE:
854 case ISD::SETLE:
855 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
856 break;
857 case ISD::SETONE:
858 case ISD::SETNE:
859 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
860 break;
861 default:
862 llvm_unreachable("invalid ISD:SET cond code");
863 }
864
865 // Pick opcode based on comparison type.
866 MachineInstr *VcmpMI;
867 const MachineOperand &Op0 = MI.getOperand(0);
868 const MachineOperand &Op1 = MI.getOperand(1);
869
870 // VCC represents lanes killed.
871 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
872
873 if (TRI->isVGPR(*MRI, Op0.getReg())) {
874 Opcode = AMDGPU::getVOPe32(Opcode);
875 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
876 } else {
877 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
878 .addReg(VCC, RegState::Define)
879 .addImm(0) // src0 modifiers
880 .add(Op1)
881 .addImm(0) // src1 modifiers
882 .add(Op0)
883 .addImm(0); // omod
884 }
885
886 MachineInstr *MaskUpdateMI =
887 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
888 .addReg(LiveMaskReg)
889 .addReg(VCC);
890
891 // State of SCC represents whether any lanes are live in mask,
892 // if SCC is 0 then no lanes will be alive anymore.
893 MachineInstr *EarlyTermMI =
894 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
895
896 MachineInstr *ExecMaskMI =
897 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
898
899 assert(MBB.succ_size() == 1);
900 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
901 .addMBB(*MBB.succ_begin());
902
903 // Update live intervals
904 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
905 MBB.remove(&MI);
906
907 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
908 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
909 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
910 LIS->InsertMachineInstrInMaps(*NewTerm);
911
912 return NewTerm;
913}
914
915MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
916 MachineInstr &MI, bool IsWQM) {
917 const DebugLoc &DL = MI.getDebugLoc();
918 MachineInstr *MaskUpdateMI = nullptr;
919
920 const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
921 const MachineOperand &Op = MI.getOperand(0);
922 int64_t KillVal = MI.getOperand(1).getImm();
923 MachineInstr *ComputeKilledMaskMI = nullptr;
924 Register CndReg = !Op.isImm() ? Op.getReg() : Register();
925 Register TmpReg;
926
927 // Is this a static or dynamic kill?
928 if (Op.isImm()) {
929 if (Op.getImm() == KillVal) {
930 // Static: all active lanes are killed
931 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
932 .addReg(LiveMaskReg)
933 .addReg(Exec);
934 } else {
935 // Static: kill does nothing
936 MachineInstr *NewTerm = nullptr;
937 if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
938 LIS->RemoveMachineInstrFromMaps(MI);
939 } else {
940 assert(MBB.succ_size() == 1);
941 NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
942 .addMBB(*MBB.succ_begin());
943 LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
944 }
945 MBB.remove(&MI);
946 return NewTerm;
947 }
948 } else {
949 if (!KillVal) {
950 // Op represents live lanes after kill,
951 // so exec mask needs to be factored in.
952 TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
953 ComputeKilledMaskMI =
954 BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
955 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
956 .addReg(LiveMaskReg)
957 .addReg(TmpReg);
958 } else {
959 // Op represents lanes to kill
960 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
961 .addReg(LiveMaskReg)
962 .add(Op);
963 }
964 }
965
966 // State of SCC represents whether any lanes are live in mask,
967 // if SCC is 0 then no lanes will be alive anymore.
968 MachineInstr *EarlyTermMI =
969 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
970
971 // If we got this far, some lanes are still live;
972 // update EXEC to deactivate lanes as appropriate.
973 MachineInstr *NewTerm;
974 MachineInstr *WQMMaskMI = nullptr;
975 Register LiveMaskWQM;
976 if (IsDemote) {
977 // Demote - deactivate quads with only helper lanes
978 LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
979 WQMMaskMI =
980 BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
981 NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
982 .addReg(Exec)
983 .addReg(LiveMaskWQM);
984 } else {
985 // Kill - deactivate lanes no longer in live mask
986 if (Op.isImm()) {
987 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
988 NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
989 } else if (!IsWQM) {
990 NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
991 .addReg(Exec)
992 .addReg(LiveMaskReg);
993 } else {
994 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
995 NewTerm =
996 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
997 }
998 }
999
1000 // Update live intervals
1001 LIS->RemoveMachineInstrFromMaps(MI);
1002 MBB.remove(&MI);
1003 assert(EarlyTermMI);
1004 assert(MaskUpdateMI);
1005 assert(NewTerm);
1006 if (ComputeKilledMaskMI)
1007 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1008 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1009 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1010 if (WQMMaskMI)
1011 LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1012 LIS->InsertMachineInstrInMaps(*NewTerm);
1013
1014 if (CndReg) {
1015 LIS->removeInterval(CndReg);
1016 LIS->createAndComputeVirtRegInterval(CndReg);
1017 }
1018 if (TmpReg)
1019 LIS->createAndComputeVirtRegInterval(TmpReg);
1020 if (LiveMaskWQM)
1021 LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1022
1023 return NewTerm;
1024}
1025
1026// Replace (or supplement) instructions accessing live mask.
1027// This can only happen once all the live mask registers have been created
1028// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
1029void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1030 auto BII = Blocks.find(&MBB);
1031 if (BII == Blocks.end())
1032 return;
1033
1034 const BlockInfo &BI = BII->second;
1035 if (!BI.NeedsLowering)
1036 return;
1037
1038 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1039
1040 SmallVector<MachineInstr *, 4> SplitPoints;
1041 char State = BI.InitialState;
1042
1043 for (MachineInstr &MI : llvm::make_early_inc_range(
1044 llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1045 if (StateTransition.count(&MI))
1046 State = StateTransition[&MI];
1047
1048 MachineInstr *SplitPoint = nullptr;
1049 switch (MI.getOpcode()) {
1050 case AMDGPU::SI_DEMOTE_I1:
1051 case AMDGPU::SI_KILL_I1_TERMINATOR:
1052 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1053 break;
1054 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1055 SplitPoint = lowerKillF32(MBB, MI);
1056 break;
1057 default:
1058 break;
1059 }
1060 if (SplitPoint)
1061 SplitPoints.push_back(SplitPoint);
1062 }
1063
1064 // Perform splitting after instruction scan to simplify iteration.
1065 if (!SplitPoints.empty()) {
1066 MachineBasicBlock *BB = &MBB;
1067 for (MachineInstr *MI : SplitPoints) {
1068 BB = splitBlock(BB, MI);
1069 }
1070 }
1071}
1072
1073// Return an iterator in the (inclusive) range [First, Last] at which
1074// instructions can be safely inserted, keeping in mind that some of the
1075// instructions we want to add necessarily clobber SCC.
1076MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1077 MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1078 MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1079 if (!SaveSCC)
1080 return PreferLast ? Last : First;
1081
1082 LiveRange &LR =
1083 LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1084 auto MBBE = MBB.end();
1085 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1086 : LIS->getMBBEndIdx(&MBB);
1087 SlotIndex LastIdx =
1088 Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1089 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1090 const LiveRange::Segment *S;
1091
1092 for (;;) {
1093 S = LR.getSegmentContaining(Idx);
1094 if (!S)
1095 break;
1096
1097 if (PreferLast) {
1098 SlotIndex Next = S->start.getBaseIndex();
1099 if (Next < FirstIdx)
1100 break;
1101 Idx = Next;
1102 } else {
1103 MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1104 assert(EndMI && "Segment does not end on valid instruction");
1105 auto NextI = std::next(EndMI->getIterator());
1106 if (NextI == MBB.end())
1107 break;
1108 SlotIndex Next = LIS->getInstructionIndex(*NextI);
1109 if (Next > LastIdx)
1110 break;
1111 Idx = Next;
1112 }
1113 }
1114
1115 MachineBasicBlock::iterator MBBI;
1116
1117 if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1118 MBBI = MI;
1119 else {
1120 assert(Idx == LIS->getMBBEndIdx(&MBB));
1121 MBBI = MBB.end();
1122 }
1123
1124 // Move insertion point past any operations modifying EXEC.
1125 // This assumes that the value of SCC defined by any of these operations
1126 // does not need to be preserved.
1127 while (MBBI != Last) {
1128 bool IsExecDef = false;
1129 for (const MachineOperand &MO : MBBI->all_defs()) {
1130 IsExecDef |=
1131 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1132 }
1133 if (!IsExecDef)
1134 break;
1135 MBBI++;
1136 S = nullptr;
1137 }
1138
1139 if (S)
1140 MBBI = saveSCC(MBB, MBBI);
1141
1142 return MBBI;
1143}
1144
1145void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1146 MachineBasicBlock::iterator Before,
1147 Register SaveWQM) {
1148 bool IsTerminator = Before == MBB.end();
1149 if (!IsTerminator) {
1150 auto FirstTerm = MBB.getFirstTerminator();
1151 if (FirstTerm != MBB.end()) {
1152 SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1153 SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1154 IsTerminator = BeforeIdx > FirstTermIdx;
1155 }
1156 }
1157
1158 MachineInstr *MI;
1159
1160 if (SaveWQM) {
1161 unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1162 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1163 .addReg(LiveMaskReg);
1164 } else {
1165 unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1166 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1167 .addReg(Exec)
1168 .addReg(LiveMaskReg);
1169 }
1170
1171 LIS->InsertMachineInstrInMaps(*MI);
1172 StateTransition[MI] = StateExact;
1173}
1174
1175void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1176 MachineBasicBlock::iterator Before,
1177 Register SavedWQM) {
1178 MachineInstr *MI;
1179
1180 if (SavedWQM) {
1181 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1182 .addReg(SavedWQM);
1183 } else {
1184 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1185 }
1186
1187 LIS->InsertMachineInstrInMaps(*MI);
1188 StateTransition[MI] = StateWQM;
1189}
1190
1191void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1192 MachineBasicBlock::iterator Before,
1193 Register SaveOrig, char StrictStateNeeded) {
1194 MachineInstr *MI;
1195 assert(SaveOrig);
1196 assert(StrictStateNeeded == StateStrictWWM ||
1197 StrictStateNeeded == StateStrictWQM);
1198
1199 if (StrictStateNeeded == StateStrictWWM) {
1200 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1201 SaveOrig)
1202 .addImm(-1);
1203 } else {
1204 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1205 SaveOrig)
1206 .addImm(-1);
1207 }
1208 LIS->InsertMachineInstrInMaps(*MI);
1209 StateTransition[MI] = StrictStateNeeded;
1210}
1211
1212void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1213 MachineBasicBlock::iterator Before,
1214 Register SavedOrig, char NonStrictState,
1215 char CurrentStrictState) {
1216 MachineInstr *MI;
1217
1218 assert(SavedOrig);
1219 assert(CurrentStrictState == StateStrictWWM ||
1220 CurrentStrictState == StateStrictWQM);
1221
1222 if (CurrentStrictState == StateStrictWWM) {
1223 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1224 Exec)
1225 .addReg(SavedOrig);
1226 } else {
1227 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1228 Exec)
1229 .addReg(SavedOrig);
1230 }
1231 LIS->InsertMachineInstrInMaps(*MI);
1232 StateTransition[MI] = NonStrictState;
1233}
1234
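// Walk the block forward tracking the current execution state, and lazily
// insert the required transitions (toExact/toWQM/toStrictMode/fromStrictMode)
// at the earliest point where switching is still safe (FirstWQM/FirstStrict).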
1235void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1236 auto BII = Blocks.find(&MBB);
1237 if (BII == Blocks.end())
1238 return;
1239
1240 BlockInfo &BI = BII->second;
1241
1242 // This is a non-entry block that is WQM throughout, so no need to do
1243 // anything.
1244 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1245 BI.InitialState = StateWQM;
1246 return;
1247 }
1248
1249 LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1250 << ":\n");
1251
1252 Register SavedWQMReg;
1253 Register SavedNonStrictReg;
1254 bool WQMFromExec = IsEntry;
1255 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1256 char NonStrictState = 0;
1257 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1258
1259 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1260 if (IsEntry) {
1261 // Skip the instruction that saves LiveMask
1262 if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1263 II->getOperand(1).getReg() == TRI->getExec())
1264 ++II;
1265 }
1266
1267 // This stores the first instruction where it's safe to switch from WQM to
1268 // Exact or vice versa.
1269 MachineBasicBlock::iterator FirstWQM = IE;
1270
1271 // This stores the first instruction where it's safe to switch from Strict
1272 // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1273 // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1274 // be safe to switch to/from WQM as well.
1275 MachineBasicBlock::iterator FirstStrict = IE;
1276
1277 // Record initial state in block information.
1278 BI.InitialState = State;
1279
1280 for (;;) {
1281 MachineBasicBlock::iterator Next = II;
1282 char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1283 char OutNeeds = 0;
1284
1285 if (FirstWQM == IE)
1286 FirstWQM = II;
1287
1288 if (FirstStrict == IE)
1289 FirstStrict = II;
1290
1291 // First, figure out the allowed states (Needs) based on the propagated
1292 // flags.
1293 if (II != IE) {
1294 MachineInstr &MI = *II;
1295
1296 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1297 auto III = Instructions.find(&MI);
1298 if (III != Instructions.end()) {
1299 if (III->second.Needs & StateStrictWWM)
1300 Needs = StateStrictWWM;
1301 else if (III->second.Needs & StateStrictWQM)
1302 Needs = StateStrictWQM;
1303 else if (III->second.Needs & StateWQM)
1304 Needs = StateWQM;
1305 else
1306 Needs &= ~III->second.Disabled;
1307 OutNeeds = III->second.OutNeeds;
1308 }
1309 } else {
1310 // If the instruction doesn't actually need a correct EXEC, then we can
1311 // safely leave Strict mode enabled.
1312 Needs = StateExact | StateWQM | StateStrict;
1313 }
1314
1315 // Exact mode exit can occur in terminators, but must be before branches.
1316 if (MI.isBranch() && OutNeeds == StateExact)
1317 Needs = StateExact;
1318
1319 ++Next;
1320 } else {
1321 // End of basic block
1322 if (BI.OutNeeds & StateWQM)
1323 Needs = StateWQM;
1324 else if (BI.OutNeeds == StateExact)
1325 Needs = StateExact;
1326 else
1327 Needs = StateWQM | StateExact;
1328 }
1329
1330 // Now, transition if necessary.
1331 if (!(Needs & State)) {
1332 MachineBasicBlock::iterator First;
1333 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1334 State == StateStrictWQM || Needs == StateStrictWQM) {
1335 // We must switch to or from Strict mode.
1336 First = FirstStrict;
1337 } else {
1338 // We only need to switch to/from WQM, so we can use FirstWQM.
1339 First = FirstWQM;
1340 }
1341
1342 // Whether we need to save SCC depends on start and end states.
1343 bool SaveSCC = false;
1344 switch (State) {
1345 case StateExact:
1346 case StateStrictWWM:
1347 case StateStrictWQM:
1348 // Exact/Strict -> Strict: save SCC
1349 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1350 // Exact/Strict -> Exact: no save
1351 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1352 break;
1353 case StateWQM:
1354 // WQM -> Exact/Strict: save SCC
1355 SaveSCC = !(Needs & StateWQM);
1356 break;
1357 default:
1358 llvm_unreachable("Unknown state");
1359 break;
1360 }
1361 MachineBasicBlock::iterator Before =
1362 prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1363
1364 if (State & StateStrict) {
1365 assert(State == StateStrictWWM || State == StateStrictWQM);
1366 assert(SavedNonStrictReg);
1367 fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1368
1369 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1370 SavedNonStrictReg = 0;
1371 State = NonStrictState;
1372 }
1373
1374 if (Needs & StateStrict) {
1375 NonStrictState = State;
1376 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1377 assert(!SavedNonStrictReg);
1378 SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1379
1380 toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1381 State = Needs;
1382
1383 } else {
1384 if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1385 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1386 assert(!SavedWQMReg);
1387 SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1388 }
1389
1390 toExact(MBB, Before, SavedWQMReg);
1391 State = StateExact;
1392 } else if (State == StateExact && (Needs & StateWQM) &&
1393 !(Needs & StateExact)) {
1394 assert(WQMFromExec == (SavedWQMReg == 0));
1395
1396 toWQM(MBB, Before, SavedWQMReg);
1397
1398 if (SavedWQMReg) {
1399 LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1400 SavedWQMReg = 0;
1401 }
1402 State = StateWQM;
1403 } else {
1404 // We can get here if we transitioned from StrictWWM to a
1405 // non-StrictWWM state that already matches our needs, in which
1406 // case there is nothing more to do.
1407 assert(Needs & State);
1408 }
1409 }
1410 }
1411
1412 if (Needs != (StateExact | StateWQM | StateStrict)) {
1413 if (Needs != (StateExact | StateWQM))
1414 FirstWQM = IE;
1415 FirstStrict = IE;
1416 }
1417
1418 if (II == IE)
1419 break;
1420
1421 II = Next;
1422 }
1423 assert(!SavedWQMReg);
1424 assert(!SavedNonStrictReg);
1425}
1426
1427void SIWholeQuadMode::lowerLiveMaskQueries() {
1428 for (MachineInstr *MI : LiveMaskQueries) {
1429 const DebugLoc &DL = MI->getDebugLoc();
1430 Register Dest = MI->getOperand(0).getReg();
1431
1432 MachineInstr *Copy =
1433 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1434 .addReg(LiveMaskReg);
1435
1436 LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1437 MI->eraseFromParent();
1438 }
1439}
1440
1441void SIWholeQuadMode::lowerCopyInstrs() {
1442 for (MachineInstr *MI : LowerToMovInstrs) {
1443 assert(MI->getNumExplicitOperands() == 2);
1444
1445 const Register Reg = MI->getOperand(0).getReg();
1446
1447 const TargetRegisterClass *regClass =
1448 TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1449 if (TRI->isVGPRClass(regClass)) {
1450 const unsigned MovOp = TII->getMovOpcode(regClass);
1451 MI->setDesc(TII->get(MovOp));
1452
1453 // Check that it already implicitly depends on exec (like all VALU movs
1454 // should do).
1455 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1456 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1457 }));
1458 } else {
1459 // Remove early-clobber and exec dependency from simple SGPR copies.
1460 // This allows some to be eliminated during/post RA.
1461 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1462 if (MI->getOperand(0).isEarlyClobber()) {
1463 LIS->removeInterval(Reg);
1464 MI->getOperand(0).setIsEarlyClobber(false);
1465 LIS->createAndComputeVirtRegInterval(Reg);
1466 }
1467 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1468 while (Index >= 0) {
1469 MI->removeOperand(Index);
1470 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1471 }
1472 MI->setDesc(TII->get(AMDGPU::COPY));
1473 LLVM_DEBUG(dbgs() << " -> " << *MI);
1474 }
1475 }
1476 for (MachineInstr *MI : LowerToCopyInstrs) {
1477 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1478 MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1479 assert(MI->getNumExplicitOperands() == 3);
1480 // The only reason we should be here is that V_SET_INACTIVE has
1481 // an undef input, so it is being replaced by a simple copy.
1482 // There should be a second undef source that we should remove.
1483 assert(MI->getOperand(2).isUndef());
1484 MI->removeOperand(2);
1485 MI->untieRegOperand(1);
1486 } else {
1487 assert(MI->getNumExplicitOperands() == 2);
1488 }
1489
1490 unsigned CopyOp = MI->getOperand(1).isReg()
1491 ? (unsigned)AMDGPU::COPY
1492 : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1493 *MRI, MI->getOperand(0)));
1494 MI->setDesc(TII->get(CopyOp));
1495 }
1496}
1497
1498void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1499 for (MachineInstr *MI : KillInstrs) {
1500 MachineBasicBlock *MBB = MI->getParent();
1501 MachineInstr *SplitPoint = nullptr;
1502 switch (MI->getOpcode()) {
1503 case AMDGPU::SI_DEMOTE_I1:
1504 case AMDGPU::SI_KILL_I1_TERMINATOR:
1505 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1506 break;
1507 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1508 SplitPoint = lowerKillF32(*MBB, *MI);
1509 break;
1510 }
1511 if (SplitPoint)
1512 splitBlock(MBB, SplitPoint);
1513 }
1514}
1515
1516void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1517 MachineBasicBlock *MBB = MI.getParent();
1518 bool IsWave32 = ST->isWave32();
1519
1520 if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1521 // This should be before all vector instructions.
1522 MachineInstr *InitMI =
1523 BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1524 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1525 Exec)
1526 .addImm(MI.getOperand(0).getImm());
1527 if (LIS) {
1528 LIS->RemoveMachineInstrFromMaps(MI);
1529 LIS->InsertMachineInstrInMaps(*InitMI);
1530 }
1531 MI.eraseFromParent();
1532 return;
1533 }
1534
1535 // Extract the thread count from an SGPR input and set EXEC accordingly.
1536 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1537 //
1538 // S_BFE_U32 count, input, {shift, 7}
1539 // S_BFM_B64 exec, count, 0
1540 // S_CMP_EQ_U32 count, 64
1541 // S_CMOV_B64 exec, -1
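//
// For example, a thread count of 13 yields EXEC = (1 << 13) - 1 via S_BFM;
// a count equal to the wavefront size takes the S_CMP/S_CMOV path instead
// and sets EXEC to all ones.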
1542 Register InputReg = MI.getOperand(0).getReg();
1543 MachineInstr *FirstMI = &*MBB->begin();
1544 if (InputReg.isVirtual()) {
1545 MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1546 assert(DefInstr && DefInstr->isCopy());
1547 if (DefInstr->getParent() == MBB) {
1548 if (DefInstr != FirstMI) {
1549 // If the `InputReg` is defined in the current block, we also need to
1550 // move that instruction to the beginning of the block.
1551 DefInstr->removeFromParent();
1552 MBB->insert(FirstMI, DefInstr);
1553 if (LIS)
1554 LIS->handleMove(*DefInstr);
1555 } else {
1556 // If the first instruction is the definition, then move the insertion point past it.
1557 FirstMI = &*std::next(FirstMI->getIterator());
1558 }
1559 }
1560 }
1561
1562 // Insert instruction sequence at block beginning (before vector operations).
1563 const DebugLoc DL = MI.getDebugLoc();
1564 const unsigned WavefrontSize = ST->getWavefrontSize();
1565 const unsigned Mask = (WavefrontSize << 1) - 1;
1566 Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1567 auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1568 .addReg(InputReg)
1569 .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1570 auto BfmMI =
1571 BuildMI(*MBB, FirstMI, DL,
1572 TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1573 .addReg(CountReg)
1574 .addImm(0);
1575 auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1576 .addReg(CountReg, RegState::Kill)
1577 .addImm(WavefrontSize);
1578 auto CmovMI =
1579 BuildMI(*MBB, FirstMI, DL,
1580 TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1581 Exec)
1582 .addImm(-1);
1583
1584 if (!LIS) {
1585 MI.eraseFromParent();
1586 return;
1587 }
1588
1589 LIS->RemoveMachineInstrFromMaps(MI);
1590 MI.eraseFromParent();
1591
1592 LIS->InsertMachineInstrInMaps(*BfeMI);
1593 LIS->InsertMachineInstrInMaps(*BfmMI);
1594 LIS->InsertMachineInstrInMaps(*CmpMI);
1595 LIS->InsertMachineInstrInMaps(*CmovMI);
1596
1597 LIS->removeInterval(InputReg);
1598 LIS->createAndComputeVirtRegInterval(InputReg);
1599 LIS->createAndComputeVirtRegInterval(CountReg);
1600}
1601
1602/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1603/// for instructions that depend on EXEC.
1605SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
1606 MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1607
1608 for (MachineInstr *MI : InitExecInstrs) {
1609 // Try to handle undefined cases gracefully:
1610 // - multiple INIT_EXEC instructions
1611 // - INIT_EXEC instructions not in the entry block
1612 if (MI->getParent() == &Entry)
1613 InsertPt = std::next(MI->getIterator());
1614
1615 lowerInitExec(*MI);
1616 }
1617
1618 return InsertPt;
1619}
1620
1621bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1622 LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1623 << " ------------- \n");
1624 LLVM_DEBUG(MF.dump(););
1625
1626 Instructions.clear();
1627 Blocks.clear();
1628 LiveMaskQueries.clear();
1629 LowerToCopyInstrs.clear();
1630 LowerToMovInstrs.clear();
1631 KillInstrs.clear();
1632 InitExecInstrs.clear();
1633 StateTransition.clear();
1634
1635 ST = &MF.getSubtarget<GCNSubtarget>();
1636
1637 TII = ST->getInstrInfo();
1638 TRI = &TII->getRegisterInfo();
1639 MRI = &MF.getRegInfo();
1640 LIS = &getAnalysis<LiveIntervals>();
1641 auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1642 MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1643 auto *PDTWrapper =
1644 getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1645 PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1646
1647 if (ST->isWave32()) {
1648 AndOpc = AMDGPU::S_AND_B32;
1649 AndTermOpc = AMDGPU::S_AND_B32_term;
1650 AndN2Opc = AMDGPU::S_ANDN2_B32;
1651 XorOpc = AMDGPU::S_XOR_B32;
1652 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1653 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1654 WQMOpc = AMDGPU::S_WQM_B32;
1655 Exec = AMDGPU::EXEC_LO;
1656 } else {
1657 AndOpc = AMDGPU::S_AND_B64;
1658 AndTermOpc = AMDGPU::S_AND_B64_term;
1659 AndN2Opc = AMDGPU::S_ANDN2_B64;
1660 XorOpc = AMDGPU::S_XOR_B64;
1661 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1662 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1663 WQMOpc = AMDGPU::S_WQM_B64;
1664 Exec = AMDGPU::EXEC;
1665 }
1666
1667 const char GlobalFlags = analyzeFunction(MF);
1668 const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1669
1670 LiveMaskReg = Exec;
1671
1672 MachineBasicBlock &Entry = MF.front();
1673 MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);
1674
1675 // Shader is simple and does not need any state changes or complex lowering
1676 if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1677 LowerToMovInstrs.empty() && KillInstrs.empty()) {
1678 lowerLiveMaskQueries();
1679 return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
1680 }
1681
1682 // Store a copy of the original live mask when required
1683 if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1684 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1685 MachineInstr *MI =
1686 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1687 .addReg(Exec);
1688 LIS->InsertMachineInstrInMaps(*MI);
1689 }
1690
1691 LLVM_DEBUG(printInfo());
1692
1693 lowerLiveMaskQueries();
1694 lowerCopyInstrs();
1695
1696 // Shader only needs WQM
1697 if (GlobalFlags == StateWQM) {
1698 auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1699 .addReg(Exec);
1700 LIS->InsertMachineInstrInMaps(*MI);
1701 lowerKillInstrs(true);
1702 } else {
1703 for (auto BII : Blocks)
1704 processBlock(*BII.first, BII.first == &Entry);
1705 // Lowering blocks causes block splitting so perform as a second pass.
1706 for (auto BII : Blocks)
1707 lowerBlock(*BII.first);
1708 }
1709
1710 // Compute live range for live mask
1711 if (LiveMaskReg != Exec)
1712 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1713
1714 // Physical registers like SCC aren't tracked by default anyway, so just
1715 // removing the ranges we computed is the simplest option for maintaining
1716 // the analysis results.
1717 LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1718
1719 // If we performed any kills then recompute EXEC
1720 if (!KillInstrs.empty())
1721 LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1722
1723 return true;
1724}
bool isCopy() const
MachineInstr * removeFromParent()
Unlink 'this' from the containing basic block, and return it without deleting it.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:64
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
Definition: SlotIndexes.h:223
SlotIndexes pass.
Definition: SlotIndexes.h:296
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
VNInfo - Value Number Information.
Definition: LiveInterval.h:53
LLVM Value Representation.
Definition: Value.h:74
self_iterator getIterator()
Definition: ilist_node.h:132
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char WavefrontSize[]
Key for Kernel::CodeProps::Metadata::mWavefrontSize.
Key
PAL metadata keys.
LLVM_READONLY int getVOPe32(uint16_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:148
@ Define
Register definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
FunctionPass * createSIWholeQuadModePass()
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
char & SIWholeQuadModeID
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:293
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
static constexpr LaneBitmask getAll()
Definition: LaneBitmask.h:82
constexpr bool any() const
Definition: LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:162