//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching, this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z
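  //
  // As a concrete sketch of the first pattern (hypothetical wave64 MIR;
  // register names and operand details invented for illustration):
  //
  //   $sgpr0_sgpr1 = S_MOV_B64 -1
  //   $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
  //   S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
  //
  // becomes a branch on the exec mask alone:
  //
  //   S_CBRANCH_EXECNZ %bb.2, implicit $exec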

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used in the AND instruction, fold the immediate into
    // the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for s_andn2.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace the AND with a MOV.
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove the AND instruction.
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch.
    // Remove all successors shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

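// A minimal sketch of what optimizeSetGPR removes (hypothetical MIR; operands
// simplified and invented for illustration): given two identical
// S_SET_GPR_IDX_ON instructions,
//
//   S_SET_GPR_IDX_ON $sgpr4, 1
//   $vgpr0 = V_MOV_B32_indirect_read ...
//   S_SET_GPR_IDX_OFF
//   S_SET_GPR_IDX_ON $sgpr4, 1
//   $vgpr1 = V_MOV_B32_indirect_read ...
//
// the intervening S_SET_GPR_IDX_OFF and the second S_SET_GPR_IDX_ON are
// erased, leaving the index mode established once:
//
//   S_SET_GPR_IDX_ON $sgpr4, 1
//   $vgpr0 = V_MOV_B32_indirect_read ...
//   $vgpr1 = V_MOV_B32_indirect_read ...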
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 16> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan forward from the identical S_SET_GPR_IDX_ON and check that the
  // second one is redundant.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability that execz is false, i.e. that
    // the then-block is executed. The transformation is profitable if always
    // executing the 'then' block is cheaper than sometimes executing the
    // 'then' block and always executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P)*ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    //   BranchNotTakenCost
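    //
    // A worked instance with invented numbers: take P = 3/4 (N = 3, D = 4),
    // BranchTakenCost = 4, and BranchNotTakenCost = 1. With ThenCyclesCost =
    // 10 the check is (4-3)*10 <= (4-3)*4 + 3*1, i.e. 10 <= 7, so the branch
    // must stay; with ThenCyclesCost = 6 it is 6 <= 7, so removal pays off.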
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
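// Illustrative CFG shape (hypothetical MIR block numbers):
//
//   bb.0: ... s_cbranch_execz %bb.2
//   bb.1: the "then" block, falling through to bb.2
//   bb.2: ...
//
// When the cost model decides that always executing bb.1 is cheaper than
// keeping the skip branch, the s_cbranch_execz in bb.0 is erased and bb.2 is
// removed from bb.0's successor list.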
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {

  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only branches that are both legal and profitable to remove.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check the first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if the
    // second one is not needed. Do the expensive checks in optimizeSetGPR()
    // and limit the distance to 20 instructions for compile-time purposes.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}
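
// For reference, a sketch of exercising this pass from the command line (the
// -debug-only name follows DEBUG_TYPE above; an assertions-enabled build of
// llc is assumed):
//
//   llc -march=amdgcn -mcpu=gfx900 -debug-only=si-pre-emit-peephole foo.ll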