LLVM 23.0.0git
AMDGPUGlobalISelDivergenceLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
11/// Lane mask merging uses same algorithm as SDAG in SILowerI1Copies.
12/// Handles all cases of temporal divergence.
13/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
14/// currently depends on LCSSA to insert phis with one incoming.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPU.h"
20#include "SILowerI1Copies.h"
25
26#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
27
28using namespace llvm;
29
30namespace {
31
// Legacy-pass-manager wrapper: runs the divergence-lowering helpers over a
// machine function. Registered below via INITIALIZE_PASS_BEGIN/END.
32class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
33public:
 // Pass identification, replacement for typeid.
34 static char ID;
35
36public:
37 AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}
38
39 bool runOnMachineFunction(MachineFunction &MF) override;
40
41 StringRef getPassName() const override {
42 return "AMDGPU GlobalISel divergence lowering";
43 }
44
45 void getAnalysisUsage(AnalysisUsage &AU) const override {
 // Registers are rewritten, but the CFG itself is never modified.
46 AU.setPreservesCFG();
 // NOTE(review): the addRequired<> calls (dominator/post-dominator/
 // uniformity analyses, judging by runOnMachineFunction below) are on
 // lines missing from this extract — confirm against upstream.
51 }
52};
53
// Implements the lane-mask merging algorithm (see SILowerI1Copies) for
// GlobalISel: divergent i1 phis become lane-mask phis, and temporal
// divergence (values defined in a cycle, used outside it) is lowered.
54class DivergenceLoweringHelper : public PhiLoweringHelper {
55public:
 // NOTE(review): the constructor parameter list continues on lines missing
 // from this extract; the definition below takes (MF, DT, PDT, MUI).
56 DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
59
60private:
 // Uniformity analysis used to identify divergent i1 values and the
 // temporal-divergence list; set by the constructor.
61 MachineUniformityInfo *MUI = nullptr;
 // Copies Reg into a fresh lane-mask register just after Reg's definition.
63 Register buildRegCopyToLaneMask(Register Reg);
64
65public:
 // PhiLoweringHelper overrides implementing the target-specific pieces of
 // the lane-mask merging algorithm.
66 void markAsLaneMask(Register DstReg) const override;
67 void getCandidatesForLowering(
68 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
69 void collectIncomingValuesFromPhi(
70 const MachineInstr *MI,
71 SmallVectorImpl<Incoming> &Incomings) const override;
72 void replaceDstReg(Register NewReg, Register OldReg,
73 MachineBasicBlock *MBB) override;
74 void buildMergeLaneMasks(MachineBasicBlock &MBB,
76 Register DstReg, Register PrevReg,
77 Register CurReg) override;
78 void constrainAsLaneMask(Incoming &In) override;
79
 // Temporal-divergence lowering: non-i1 values (copy to vgpr under EXEC)
 // and i1 values (lane-mask merging across the relevant cycle).
80 bool lowerTemporalDivergence();
81 bool lowerTemporalDivergenceI1();
82};
83
// Constructor: forwards MF/DT/PDT to PhiLoweringHelper, stores the
// uniformity info and initializes the MachineIRBuilder on the function.
// NOTE(review): the parameter-list lines (85-86) are missing from this
// extract — confirm against upstream.
84DivergenceLoweringHelper::DivergenceLoweringHelper(
87 : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
88
89// _(s1) -> SReg_32/64(s1)
90void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
91 assert(MRI->getType(DstReg) == LLT::scalar(1));
92
93 if (MRI->getRegClassOrNull(DstReg)) {
94 if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
95 return;
96 llvm_unreachable("Failed to constrain register class");
97 }
98
99 MRI->setRegClass(DstReg, ST->getBoolRC());
100}
101
102void DivergenceLoweringHelper::getCandidatesForLowering(
103 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
104 LLT S1 = LLT::scalar(1);
105
106 // Add divergent i1 G_PHIs to the list. Only consider G_PHI instructions,
107 // not PHI instructions that may have been created by earlier lowering stages
108 // (e.g., lowerTemporalDivergenceI1).
109 for (MachineBasicBlock &MBB : *MF) {
110 for (MachineInstr &MI : MBB.phis()) {
111 if (MI.getOpcode() != TargetOpcode::G_PHI)
112 continue;
113 Register Dst = MI.getOperand(0).getReg();
114 if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
115 Vreg1Phis.push_back(&MI);
116 }
117 }
118}
119
120void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
121 const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
122 for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
123 Incomings.emplace_back(MI->getOperand(i).getReg(),
124 MI->getOperand(i + 1).getMBB(), Register());
125 }
126}
127
// Redefine OldReg as a COPY of NewReg at the top of MBB (after any phis),
// rather than replacing OldReg's uses — callers keep referring to OldReg.
// NOTE(review): the `MachineBasicBlock *MBB` parameter line (129) is missing
// from this extract — confirm against upstream.
128void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
130 BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
131 .addReg(NewReg);
132}
133
134// Copy Reg to new lane mask register, insert a copy after instruction that
135// defines Reg while skipping phis if needed.
136Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
137 Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
138 MachineInstr *Instr = MRI->getVRegDef(Reg);
139 MachineBasicBlock *MBB = Instr->getParent();
140 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
141 B.buildCopy(LaneMask, Reg);
142 return LaneMask;
143}
144
145// bb.previous
146// %PrevReg = ...
147//
148// bb.current
149// %CurReg = ...
150//
151// %DstReg - not defined
152//
153// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
154//
155// bb.previous
156// %PrevReg = ...
157// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
158//
159// bb.current
160// %CurReg = ...
161// %CurRegCopy:sreg_32(s1) = COPY %CurReg
162// ...
163// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
164// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
165// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
166//
167// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
168void DivergenceLoweringHelper::buildMergeLaneMasks(
// NOTE(review): the parameter line (169) carrying the block/iterator
// arguments (MBB, I, presumably a DebugLoc) is missing from this extract.
170 Register DstReg, Register PrevReg, Register CurReg) {
171 // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
172 // TODO: check if inputs are constants or results of a compare.
173
 // Copy both inputs into fresh lane-mask registers right after their defs.
174 Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
175 Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
176 Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
177 Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
178
179 B.setInsertPt(MBB, I);
 // AndN2Op/AndOp/OrOp and ExecReg are wave-size dependent, supplied by
 // PhiLoweringHelper.
180 B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
181 B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
182 B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
183}
184
185// GlobalISel has to constrain S1 incoming taken as-is with lane mask register
186// class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,
187// Incoming.Reg becomes that new lane mask.
188void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
189 B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
190
191 auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
192 MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
193 In.Reg = Copy.getReg(0);
194}
195
196void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
197 Register NewReg) {
198 for (MachineOperand &Op : Inst->operands()) {
199 if (Op.isReg() && Op.getReg() == Reg)
200 Op.setReg(NewReg);
201 }
202}
203
// Lower non-i1 temporal divergence: a uniform (sgpr) value defined inside a
// cycle with divergent exit but used outside it must be read per-lane, so a
// vgpr copy is inserted at the definition point with EXEC as an implicit use.
204bool DivergenceLoweringHelper::lowerTemporalDivergence() {
// NOTE(review): the declarations of ILMA and TDCache (lines 205-206) are
// missing from this extract.
207
208 for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
 // Skip i1 values (handled by lowerTemporalDivergenceI1), values that are
 // already divergent, and S32/S64 lane masks.
209 if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
210 ILMA.isS32S64LaneMask(Reg))
211 continue;
212
 // Reuse a previously created vgpr copy for the same register.
213 Register CachedTDCopy = TDCache.lookup(Reg);
214 if (CachedTDCopy) {
215 replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
216 continue;
217 }
218
219 MachineInstr *Inst = MRI->getVRegDef(Reg);
// NOTE(review): the definition of MBB (line 220, presumably
// Inst->getParent()) is missing from this extract.
221 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));
222
223 Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
 // The implicit EXEC use ties the copy to the lanes active at this point.
224 B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
225 .addUse(ExecReg, RegState::Implicit);
226
227 replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
228 TDCache[Reg] = VgprReg;
229 }
 // Always reports false here; callers OR the results together.
230 return false;
231}
232
// Lower i1 temporal divergence: an i1 (lane-mask) value defined inside a
// cycle and used outside it is re-merged with EXEC at the end of the
// defining block, and uses outside the cycle are rewritten to the merged
// mask (SSA form is restored with MachineSSAUpdater).
233bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
234 MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
235 initializeLaneMaskRegisterAttributes(BoolS1);
// NOTE(review): the SSAUpdater and LRCCache declarations (lines 236 and 240)
// are missing from this extract.
237
238 // In case of use outside multiple nested cycles or multiple uses we only need
239 // to merge lane mask across largest relevant cycle.
241 for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
242 if (MRI->getType(Reg) != LLT::scalar(1))
243 continue;
244
 // Keep only the outermost (largest) relevant cycle per register.
245 auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
246 auto &CycleMergedMask = LRCCacheIter->getSecond();
247 const MachineCycle *&CachedLRC = CycleMergedMask.first;
248 if (RegNotCached || LRC->contains(CachedLRC)) {
249 CachedLRC = LRC;
250 }
251 }
252
253 for (auto &LRCCacheEntry : LRCCache) {
254 Register Reg = LRCCacheEntry.first;
255 auto &CycleMergedMask = LRCCacheEntry.getSecond();
256 const MachineCycle *Cycle = CycleMergedMask.first;
257
258 Register MergedMask = MRI->createVirtualRegister(BoolS1);
259 SSAUpdater.Initialize(MergedMask);
260
261 MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
262 SSAUpdater.AddAvailableValue(MBB, MergedMask);
263
 // Predecessors outside the cycle carry no meaningful mask yet; seed them
 // with IMPLICIT_DEF so the SSA updater has a value on every path.
264 for (auto Entry : Cycle->getEntries()) {
265 for (MachineBasicBlock *Pred : Entry->predecessors()) {
266 if (!Cycle->contains(Pred)) {
267 B.setInsertPt(*Pred, Pred->getFirstTerminator());
268 auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
269 SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
270 }
271 }
272 }
273
// NOTE(review): the remaining buildMergeLaneMasks arguments (line 275) are
// missing from this extract.
274 buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
276
277 CycleMergedMask.second = MergedMask;
278 }
279
 // Rewrite each temporally-divergent i1 use to its cycle's merged mask.
280 for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
281 if (MRI->getType(Reg) != LLT::scalar(1))
282 continue;
283
284 replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
285 }
286
 // Always reports false here; callers OR the results together.
287 return false;
288}
289
290} // End anonymous namespace.
291
// Pass registration with the legacy pass manager.
292INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
293 "AMDGPU GlobalISel divergence lowering", false, false)
// NOTE(review): the INITIALIZE_PASS_DEPENDENCY lines (294-296) are missing
// from this extract.
297INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
298 "AMDGPU GlobalISel divergence lowering", false, false)
299
300char AMDGPUGlobalISelDivergenceLowering::ID = 0;
301
// Exported pass ID alias; the declaration line (302) is missing from this
// extract.
303 AMDGPUGlobalISelDivergenceLowering::ID;
304
// Factory used by the AMDGPU target pass pipeline; the function signature
// line (305) is missing from this extract.
306 return new AMDGPUGlobalISelDivergenceLowering();
307}
308
// Pass entry point: fetch the required analyses and run the three lowering
// stages in a fixed order (see comments below for why the order matters).
309bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
310 MachineFunction &MF) {
// NOTE(review): the variable declarations binding DT / PDT / MUI to the
// getAnalysis results (lines 311, 313, 315) are missing from this extract.
312 getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
314 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
316 getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
317
318 DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
319
320 bool Changed = false;
321 // Temporal divergence lowering needs to inspect list of instructions used
322 // outside cycle with divergent exit provided by uniformity analysis. Uniform
323 // instructions from the list require lowering, no instruction is deleted.
324 // Thus it needs to be run before lowerPhis that deletes phis that require
325 // lowering and replaces them with new instructions.
326
327 // Non-i1 temporal divergence lowering.
328 Changed |= Helper.lowerTemporalDivergence();
329 // This covers both uniform and divergent i1s. Lane masks are in sgpr and need
330 // to be updated in each iteration.
331 Changed |= Helper.lowerTemporalDivergenceI1();
332 // Temporal divergence lowering of divergent i1 phi used outside of the cycle
333 // could also be handled by lowerPhis but we do it in lowerTempDivergenceI1
334 // since in some case lowerPhis does unnecessary lane mask merging.
335 Changed |= Helper.lowerPhis();
336 return Changed;
337}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define DEBUG_TYPE
const HexagonInstrInfo * TII
#define _
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
This file declares the MachineIRBuilder class.
Register Reg
Machine IR instance of the generic uniformity analysis.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Interface definition of the PhiLoweringHelper class that implements lane mask merging algorithm for d...
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
A debug info location.
Definition DebugLoc.h:123
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
iterator_range< TemporalDivergenceTuple * > getTemporalDivergenceList() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Helper class to build MachineInstr.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
mop_range operands()
MachineOperand class - Representation of each machine instruction operand.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineSSAUpdater - This class updates SSA form for a set of virtual registers defined in multiple bl...
Legacy analysis pass which computes a MachineUniformityInfo.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
Register createLaneMaskReg(MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
DWARFExpression::Operation Op
FunctionPass * createAMDGPUGlobalISelDivergenceLoweringPass()
MachineCycleInfo::CycleT MachineCycle
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
All attributes(register class or bank and low-level type) a virtual register can have.