Line data Source code
1 : //===-- SIFormMemoryClauses.cpp -------------------------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// This pass creates bundles of SMEM and VMEM instructions forming memory
12 : /// clauses if XNACK is enabled. Def operands of clauses are marked as early
13 : /// clobber to make sure we will not override any source within a clause.
14 : ///
15 : //===----------------------------------------------------------------------===//
16 :
17 : #include "AMDGPU.h"
18 : #include "AMDGPUSubtarget.h"
19 : #include "GCNRegPressure.h"
20 : #include "SIInstrInfo.h"
21 : #include "SIMachineFunctionInfo.h"
22 : #include "SIRegisterInfo.h"
23 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 : #include "llvm/ADT/DenseMap.h"
25 : #include "llvm/CodeGen/LiveIntervals.h"
26 : #include "llvm/CodeGen/MachineFunctionPass.h"
27 :
28 : using namespace llvm;
29 :
30 : #define DEBUG_TYPE "si-form-memory-clauses"
31 :
32 : // Clauses longer than 15 instructions would overflow one of the counters
33 : // and stall. They can stall even earlier if there are outstanding counters.
// Hidden command-line option "amdgpu-max-memory-clause" (default 15). The
// clause-forming loop in runOnMachineFunction stops extending a clause once
// its length reaches this value.
34 : static cl::opt<unsigned>
35 : MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15),
36 : cl::desc("Maximum length of a memory clause, instructions"));
37 :
38 : namespace {
39 :
// Machine function pass that groups runs of adjacent SMEM or VMEM load
// instructions into BUNDLE instructions ("memory clauses") when XNACK is
// enabled, adding the clause's defs to the bundle header as early-clobber.
40 : class SIFormMemoryClauses : public MachineFunctionPass {
// Per-clause register bookkeeping: register number -> (accumulated RegState
// flags, accumulated lane mask). Filled in by collectRegUses().
41 : typedef DenseMap<unsigned, std::pair<unsigned, LaneBitmask>> RegUse;
42 :
43 : public:
// Pass identifier handed to the MachineFunctionPass base class.
44 : static char ID;
45 :
46 : public:
// Constructs the pass and runs its registration hook against the global
// PassRegistry.
47 1912 : SIFormMemoryClauses() : MachineFunctionPass(ID) {
48 1912 : initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry());
49 1912 : }
50 :
// Pass entry point; returns true if at least one clause was formed.
51 : bool runOnMachineFunction(MachineFunction &MF) override;
52 :
// Human-readable pass name.
53 1912 : StringRef getPassName() const override {
54 1912 : return "SI Form memory clauses";
55 : }
56 :
// Requires LiveIntervals and declares every analysis as preserved; the pass
// updates LiveIntervals and SlotIndexes by hand in runOnMachineFunction.
57 1912 : void getAnalysisUsage(AnalysisUsage &AU) const override {
58 : AU.addRequired<LiveIntervals>();
59 : AU.setPreservesAll();
60 1912 : MachineFunctionPass::getAnalysisUsage(AU);
61 1912 : }
62 :
63 : private:
// Calls Func with the sub-register index (or indices) of Reg that cover
// LaneMask; index 0 stands for the whole register.
64 : template <typename Callable>
65 : void forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const;
66 :
// Returns false if MI cannot join the clause described by Defs/Uses.
67 : bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
// Advances RPT and returns false if the resulting pressure exceeds the limits.
68 : bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
// Merges MI's register operands into Defs/Uses.
69 : void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
// canBundle + checkPressure + collectRegUses; true if MI was accepted.
70 : bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
71 : GCNDownwardRPTracker &RPT);
72 :
// Per-function state; all four pointers are (re)assigned at the start of
// runOnMachineFunction and are not initialized by the constructor.
73 : const GCNSubtarget *ST;
74 : const SIRegisterInfo *TRI;
75 : const MachineRegisterInfo *MRI;
76 : SIMachineFunctionInfo *MFI;
77 :
// Occupancy computed by the most recent successful checkPressure() call.
78 : unsigned LastRecordedOccupancy;
// Number of allocatable registers in VGPR_32 / SGPR_32 for the current
// function; used as pressure limits in checkPressure().
79 : unsigned MaxVGPRs;
80 : unsigned MaxSGPRs;
81 : };
82 :
83 : } // End anonymous namespace.
84 :
// Pass registration boilerplate: registers SIFormMemoryClauses under the
// DEBUG_TYPE argument string with the description "SI Form memory clauses"
// and lists LiveIntervals as a dependency. These macros are presumably what
// provides initializeSIFormMemoryClausesPass(), which the constructor calls
// -- confirm against PassSupport.h.
85 85105 : INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE,
86 : "SI Form memory clauses", false, false)
87 85105 : INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
88 200936 : INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE,
89 : "SI Form memory clauses", false, false)
90 :
91 :
// Definition of the static pass identifier declared in the class.
92 : char SIFormMemoryClauses::ID = 0;
93 :
// Reference to the same identifier, exported in the llvm namespace so code
// outside this file can name the pass.
94 : char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID;
95 :
// Factory for the pass: returns a newly allocated SIFormMemoryClauses as a raw
// FunctionPass pointer. The caller is responsible for the object (presumably
// it is handed to a pass manager -- confirm at the call site).
96 0 : FunctionPass *llvm::createSIFormMemoryClausesPass() {
97 0 : return new SIFormMemoryClauses();
98 : }
99 :
// Returns true if MI is classified by SIInstrInfo as a FLAT or a VMEM
// instruction; such instructions are candidates for a VMEM clause.
100 : static bool isVMEMClauseInst(const MachineInstr &MI) {
101 3393 : return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
102 : }
103 :
// Returns true if MI is classified by SIInstrInfo as an SMRD instruction;
// such instructions are candidates for an SMEM clause.
104 : static bool isSMEMClauseInst(const MachineInstr &MI) {
105 : return SIInstrInfo::isSMRD(MI);
106 : }
107 :
108 : // There is no sense in creating store clauses: they do not define anything,
109 : // thus there is nothing to set early-clobber on.
//
// Returns true if MI may be placed into a clause of the requested kind
// (IsVMEMClause selects VMEM/FLAT, otherwise SMEM). MI is rejected when it is
// a debug value, is already part of a bundle, is not a pure load (does not
// load, or may also store), has an atomic ret/no-ret counterpart opcode, or
// does not belong to the requested memory class.
110 3593 : static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
111 3593 : if (MI.isDebugValue() || MI.isBundled())
112 : return false;
113 3593 : if (!MI.mayLoad() || MI.mayStore())
114 3120 : return false;
// Both lookups return -1 when the opcode has no atomic counterpart; any
// other result means MI is an atomic and is excluded.
115 946 : if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
116 473 : AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
117 : return false;
// The instruction must match the kind of clause being formed.
118 473 : if (IsVMEMClause && !isVMEMClauseInst(MI))
119 : return false;
120 473 : if (!IsVMEMClause && !isSMEMClauseInst(MI))
121 0 : return false;
122 : return true;
123 : }
124 :
// Translates the flags set on register operand MO into a bitwise OR of
// RegState values (Implicit, Dead, Undef, Kill, EarlyClobber, Renamable).
// The result is stored in the RegUse maps and later passed to addDef/addUse
// when the bundle header is built.
125 1374 : static unsigned getMopState(const MachineOperand &MO) {
126 : unsigned S = 0;
127 1374 : if (MO.isImplicit())
128 : S |= RegState::Implicit;
129 1374 : if (MO.isDead())
130 32 : S |= RegState::Dead;
131 1374 : if (MO.isUndef())
132 13 : S |= RegState::Undef;
133 1374 : if (MO.isKill())
134 0 : S |= RegState::Kill;
135 1374 : if (MO.isEarlyClobber())
136 0 : S |= RegState::EarlyClobber;
// isRenamable() is only queried for physical registers.
137 2748 : if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
138 0 : S |= RegState::Renamable;
139 1374 : return S;
140 : }
141 :
// Invokes Func once for every sub-register index needed to cover LaneMask of
// register Reg. Func receives 0 when the whole register is meant, otherwise a
// sub-register index. Reaches llvm_unreachable if the mask cannot be covered
// by the sub-register indices compatible with Reg's register class.
142 : template <typename Callable>
143 235 : void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
144 : Callable Func) const {
// Whole-register cases: the mask has all lanes set, Reg is physical, or the
// mask equals the maximal lane mask of the virtual register.
145 240 : if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) ||
146 5 : LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
147 : Func(0);
148 234 : return;
149 : }
150 :
// Scan every sub-register index (starting at 1) and collect the ones whose
// lanes lie entirely inside LaneMask.
151 1 : const TargetRegisterClass *RC = MRI->getRegClass(Reg);
152 1 : unsigned E = TRI->getNumSubRegIndices();
153 : SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs;
154 68 : for (unsigned Idx = 1; Idx < E; ++Idx) {
155 : // Is this index even compatible with the given class?
156 67 : if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
157 : continue;
158 9 : LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
159 : // Early exit if we found a perfect match.
160 9 : if (SubRegMask == LaneMask) {
161 0 : Func(Idx);
162 : return;
163 : }
164 :
// Skip indices that reach outside LaneMask or do not intersect it at all.
165 9 : if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
166 : continue;
167 :
168 4 : CoveringSubregs.push_back(Idx);
169 : }
170 :
// Order candidates by descending lane count, ties broken by descending
// highest lane, so the covering loop below tries the widest indices first.
171 : llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) {
172 : LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
173 : LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
174 : unsigned NA = MaskA.getNumLanes();
175 : unsigned NB = MaskB.getNumLanes();
176 : if (NA != NB)
177 : return NA > NB;
178 : return MaskA.getHighestLane() > MaskB.getHighestLane();
179 : });
180 :
// Greedy cover: take each candidate that still fits entirely into the
// remaining (not yet covered) lanes, report it, and clear its lanes.
181 2 : for (unsigned Idx : CoveringSubregs) {
182 2 : LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
// Re-checked because LaneMask shrinks as lanes are covered.
183 2 : if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
184 : continue;
185 :
186 2 : Func(Idx);
187 : LaneMask &= ~SubRegMask;
188 2 : if (LaneMask.none())
189 : return;
190 : }
191 :
192 0 : llvm_unreachable("Failed to find all subregs to cover lane mask");
193 : }
194 82 :
195 : // Returns false if MI uses a register already defined in the clause, or
196 84 : // defines a register already used in it. In this case we must break the clause.
// It also returns false if MI has a frame-index operand or a tied register
// operand. Defs and Uses describe the instructions accepted into the clause so
// far; they are only looked up here, never modified.
197 2 : bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
198 : RegUse &Defs, RegUse &Uses) const {
199 82 : // Check interference with defs.
200 : for (const MachineOperand &MO : MI.operands()) {
201 : // TODO: Prologue/Epilogue Insertion pass does not process bundled
202 0 : // instructions.
203 0 : if (MO.isFI())
204 : return false;
205 0 :
206 : if (!MO.isReg())
207 0 : continue;
208 :
209 0 : unsigned Reg = MO.getReg();
210 :
211 0 : // If it is tied we will need to write same register as we read.
212 : if (MO.isTied())
213 : return false;
214 :
// A def is checked against the clause's uses, a use against its defs.
215 : RegUse &Map = MO.isDef() ? Uses : Defs;
216 0 : auto Conflict = Map.find(Reg);
217 : if (Conflict == Map.end())
218 : continue;
219 0 :
// Physical registers conflict as a whole; no lane-level check is done.
220 : if (TargetRegisterInfo::isPhysicalRegister(Reg))
221 : return false;
222 :
// For virtual registers only overlapping lanes count as a conflict.
223 : LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
224 : if ((Conflict->second.second & Mask).any())
225 : return false;
226 : }
227 :
228 : return true;
229 : }
230 :
231 : // Since all defs in the clause are early clobber we can run out of registers.
232 0 : // Function returns false if pressure would hit the limit if instruction is
233 0 : // bundled into a memory clause.
// On success the computed occupancy is stored in LastRecordedOccupancy. RPT is
// advanced in either case. The MI parameter is not referenced by the body; the
// instruction examined is whichever one RPT is currently positioned at.
234 0 : bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
235 : GCNDownwardRPTracker &RPT) {
236 : // NB: skip advanceBeforeNext() call. Since all defs will be marked
237 : // early-clobber they will all stay alive at least to the end of the
238 : // clause. Therefore we should not decrease pressure even if load
239 0 : // pointer becomes dead and could otherwise be reused for destination.
240 : RPT.advanceToNext();
241 : GCNRegPressure MaxPressure = RPT.moveMaxPressure();
242 : unsigned Occupancy = MaxPressure.getOccupancy(*ST);
// Accept only if occupancy does not fall below the function's minimum and
// both register counts stay within the allocatable limits.
243 0 : if (Occupancy >= MFI->getMinAllowedOccupancy() &&
244 : MaxPressure.getVGPRNum() <= MaxVGPRs &&
245 153 : MaxPressure.getSGPRNum() <= MaxSGPRs) {
246 : LastRecordedOccupancy = Occupancy;
247 156 : return true;
248 3 : }
249 : return false;
250 152 : }
251 :
252 : // Collect register defs and uses along with their lane masks and states.
// Each register operand of MI is merged into Defs (if it is a def) or Uses
// (otherwise): a new entry is created on first sight, later operands of the
// same register OR their state flags and lane masks into the existing entry.
253 1 : void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
254 1 : RegUse &Defs, RegUse &Uses) const {
255 : for (const MachineOperand &MO : MI.operands()) {
256 68 : if (!MO.isReg())
257 : continue;
258 67 : unsigned Reg = MO.getReg();
// Register number 0 is skipped.
259 : if (!Reg)
260 9 : continue;
261 :
// Virtual registers use the lane mask of the operand's sub-register index;
// anything else is recorded with all lanes set.
262 9 : LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ?
263 0 : TRI->getSubRegIndexLaneMask(MO.getSubReg()) :
264 : LaneBitmask::getAll();
265 : RegUse &Map = MO.isDef() ? Defs : Uses;
266 :
267 9 : auto Loc = Map.find(Reg);
268 : unsigned State = getMopState(MO);
269 : if (Loc == Map.end()) {
270 4 : Map[Reg] = std::make_pair(State, Mask);
271 : } else {
272 : Loc->second.first |= State;
273 : Loc->second.second |= Mask;
274 : }
275 : }
276 : }
277 :
278 : // Check register def/use conflicts, occupancy limits and collect def/use maps.
279 : // Return true if instruction can be bundled with previous. If it cannot,
280 : // def/use maps are not updated.
// Note that RPT is advanced by checkPressure() whenever canBundle() succeeds,
// including the case where the pressure check then rejects the instruction.
281 : bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
282 : RegUse &Defs, RegUse &Uses,
283 2 : GCNDownwardRPTracker &RPT) {
284 2 : if (!canBundle(MI, Defs, Uses))
285 2 : return false;
286 :
287 : if (!checkPressure(MI, RPT))
288 2 : return false;
289 :
290 2 : collectRegUses(MI, Defs, Uses);
291 : return true;
292 : }
293 :
// Pass entry point. For every basic block, finds maximal runs of adjacent
// instructions that are all valid members of the same kind of clause (VMEM or
// SMEM), pass the def/use and register-pressure checks, and contain at most
// MaxClause instructions. Each run of two or more instructions is wrapped in
// a BUNDLE whose header carries the clause's defs (early-clobber) and uses.
// Returns true if at least one bundle was created. Does nothing when the
// function is skipped or XNACK is not enabled on the subtarget.
294 0 : bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
295 : if (skipFunction(MF.getFunction()))
296 : return false;
297 :
298 : ST = &MF.getSubtarget<GCNSubtarget>();
299 0 : if (!ST->isXNACKEnabled())
300 : return false;
301 :
// Cache per-function objects used by the helper methods.
302 0 : const SIInstrInfo *TII = ST->getInstrInfo();
303 : TRI = ST->getRegisterInfo();
304 : MRI = &MF.getRegInfo();
305 0 : MFI = MF.getInfo<SIMachineFunctionInfo>();
306 0 : LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
307 : SlotIndexes *Ind = LIS->getSlotIndexes();
308 0 : bool Changed = false;
309 0 :
// Pressure limits for checkPressure(): number of allocatable 32-bit
// vector and scalar registers in this function.
310 : MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
311 0 : MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
312 :
313 : for (MachineBasicBlock &MBB : MF) {
// Next is advanced past the whole clause by the inner loop, so the outer
// loop resumes right after the last instruction that was examined.
314 0 : MachineBasicBlock::instr_iterator Next;
315 0 : for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
316 : MachineInstr &MI = *I;
317 0 : Next = std::next(I);
318 0 :
// The first instruction fixes the clause kind: if it is not VMEM/FLAT,
// isValidClauseInst requires it (and all followers) to be SMRD.
319 0 : bool IsVMEM = isVMEMClauseInst(MI);
320 0 :
321 : if (!isValidClauseInst(MI, IsVMEM))
322 0 : continue;
323 0 :
// Fresh def/use maps and a pressure tracker positioned at the candidate
// clause head.
324 : RegUse Defs, Uses;
325 0 : GCNDownwardRPTracker RPT(*LIS);
326 0 : RPT.reset(MI);
327 0 :
328 : if (!processRegUses(MI, Defs, Uses, RPT))
329 : continue;
330 :
// Extend the clause over the following instructions while they qualify.
331 : unsigned Length = 1;
332 : for ( ; Next != E && Length < MaxClause; ++Next) {
333 : if (!isValidClauseInst(*Next, IsVMEM))
334 : break;
335 :
336 0 : // A load from pointer which was loaded inside the same bundle is an
337 : // impossible clause because we will need to write and read the same
338 : // register inside. In this case processRegUses will return false.
339 : if (!processRegUses(*Next, Defs, Uses, RPT))
340 : break;
341 :
342 0 : ++Length;
343 0 : }
// A single instruction is left unbundled. Next was not advanced in that
// case, so scanning continues with the instruction right after I.
344 0 : if (Length < 2)
345 0 : continue;
346 0 :
347 0 : Changed = true;
// LastRecordedOccupancy was set by the last successful checkPressure().
348 0 : MFI->limitOccupancy(LastRecordedOccupancy);
349 0 :
// Insert the BUNDLE header in front of the first clause instruction and
// give it a slot index.
350 : auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
351 : Ind->insertMachineInstrInMaps(*B);
352 :
// Attach every clause instruction to the bundle and drop its own slot
// index entry; def operands that also read their register are flagged
// as internal reads.
353 : for (auto BI = I; BI != Next; ++BI) {
354 : BI->bundleWithPred();
355 0 : Ind->removeSingleMachineInstrFromMaps(*BI);
356 :
357 0 : for (MachineOperand &MO : BI->defs())
358 0 : if (MO.readsReg())
359 0 : MO.setIsInternalRead(true);
360 0 : }
361 0 :
// Add the collected defs to the header as early-clobber operands, one
// per covering sub-register. When the whole register is added
// (SubReg == 0) the Undef and Dead flags are stripped.
362 0 : for (auto &&R : Defs) {
363 : forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
364 : unsigned S = R.second.first | RegState::EarlyClobber;
365 0 : if (!SubReg)
366 0 : S &= ~(RegState::Undef | RegState::Dead);
367 0 : B.addDef(R.first, S, SubReg);
368 : });
369 0 : }
370 0 :
// Add the collected uses to the header with the Kill flag cleared.
371 0 : for (auto &&R : Uses) {
372 0 : forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
373 : B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg);
374 0 : });
375 : }
376 :
// Recompute live intervals of the virtual registers touched by the
// clause. Registers that are defs are erased from Uses first so the
// loop over Uses below does not process them again.
377 : for (auto &&R : Defs) {
378 0 : unsigned Reg = R.first;
379 : Uses.erase(Reg);
380 : if (TargetRegisterInfo::isPhysicalRegister(Reg))
381 : continue;
382 : LIS->removeInterval(Reg);
383 473 : LIS->createAndComputeVirtRegInterval(Reg);
384 : }
385 :
386 473 : for (auto &&R : Uses) {
387 : unsigned Reg = R.first;
388 : if (TargetRegisterInfo::isPhysicalRegister(Reg))
389 464 : continue;
390 : LIS->removeInterval(Reg);
391 : LIS->createAndComputeVirtRegInterval(Reg);
392 463 : }
393 463 : }
394 : }
395 :
396 19559 : return Changed;
397 19559 : }
|