Line data Source code
1 : //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// This pass removes redundant S_OR_B64 instructions that enable lanes in
12 : /// EXEC. If two SI_END_CF (lowered as S_OR_B64) come together without any
13 : /// vector instructions between them, we can keep only the outer SI_END_CF,
14 : /// given that the CFG is structured and the exec bits of the outer end
15 : /// statement are always a superset of the exec bits of the inner one.
16 : ///
17 : /// This needs to be done before RA to eliminate the registers holding saved
18 : /// exec bits, but after the register coalescer so that no vector register
19 : /// copies remain between different end cf statements.
20 : ///
21 : //===----------------------------------------------------------------------===//
22 :
23 : #include "AMDGPU.h"
24 : #include "AMDGPUSubtarget.h"
25 : #include "SIInstrInfo.h"
26 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
27 : #include "llvm/CodeGen/LiveIntervals.h"
28 : #include "llvm/CodeGen/MachineFunctionPass.h"
29 :
30 : using namespace llvm;
31 :
32 : #define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
33 :
34 : namespace {
35 :
36 : class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
37 : public:
38 : static char ID;
39 :
40 : public:
41 1912 : SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
42 1912 : initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
43 1912 : }
44 :
45 : bool runOnMachineFunction(MachineFunction &MF) override;
46 :
47 1912 : StringRef getPassName() const override {
48 1912 : return "SI optimize exec mask operations pre-RA";
49 : }
50 :
51 1912 : void getAnalysisUsage(AnalysisUsage &AU) const override {
52 : AU.addRequired<LiveIntervals>();
53 : AU.setPreservesAll();
54 1912 : MachineFunctionPass::getAnalysisUsage(AU);
55 1912 : }
56 : };
57 :
58 : } // End anonymous namespace.
59 :
// Pass registration: the pass is registered under DEBUG_TYPE
// ("si-optimize-exec-masking-pre-ra") with the description string below, and
// LiveIntervals is declared as a dependency so it is initialized as well.
// NOTE(review): the two trailing `false` arguments are presumably the usual
// "CFG only" / "is analysis" flags of the INITIALIZE_PASS macros -- confirm
// against the LLVM pass-registration documentation.
60 85105 : INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
61 : "SI optimize exec mask operations pre-RA", false, false)
62 85105 : INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
63 200936 : INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
64 : "SI optimize exec mask operations pre-RA", false, false)
65 :
// Out-of-class definition of the static pass identifier that the constructor
// passes to MachineFunctionPass.
66 : char SIOptimizeExecMaskingPreRA::ID = 0;
67 :
// Reference in the llvm namespace that aliases the pass identifier, giving
// code outside this file a name for the pass (the class itself lives in an
// anonymous namespace).
68 : char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;
69 :
70 0 : FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
71 0 : return new SIOptimizeExecMaskingPreRA();
72 : }
73 :
74 1317 : static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
75 2743 : return MI.getOpcode() == AMDGPU::S_OR_B64 &&
76 1317 : MI.modifiesRegister(AMDGPU::EXEC, TRI);
77 : }
78 :
79 : static bool isFullExecCopy(const MachineInstr& MI) {
80 7 : return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
81 : }
82 :
83 13 : static unsigned getOrNonExecReg(const MachineInstr &MI,
84 : const SIInstrInfo &TII) {
85 : auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
86 13 : if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
87 : return Op->getReg();
88 : Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
89 0 : if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
90 : return Op->getReg();
91 : return AMDGPU::NoRegister;
92 : }
93 :
94 9 : static MachineInstr* getOrExecSource(const MachineInstr &MI,
95 : const SIInstrInfo &TII,
96 : const MachineRegisterInfo &MRI) {
97 9 : auto SavedExec = getOrNonExecReg(MI, TII);
98 9 : if (SavedExec == AMDGPU::NoRegister)
99 : return nullptr;
100 9 : auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
101 9 : if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
102 2 : return nullptr;
103 : return SaveExecInst;
104 : }
105 :
/// Runs the pass on \p MF. Two rewrites are attempted for each basic block:
///  1. For a block without successors that ends in an operand-less S_ENDPGM,
///     erase the instructions preceding the terminator until one is found
///     that may store, is a barrier or call, has unmodeled side effects or an
///     ordered memory reference. If a whole block is consumed, the scan
///     continues in predecessors that have exactly one successor.
///  2. For a block that starts with an EXEC-modifying S_OR_B64, has a single
///     layout successor, and otherwise contains only SALU instructions that
///     do not read EXEC: erase that leading S_OR_B64 when the successor also
///     starts with such an S_OR_B64 whose non-EXEC source is a full copy of
///     EXEC. The EXEC copy feeding the erased instruction is folded away as
///     well when all of its non-debug uses are in the copy's own block.
/// LiveIntervals is kept up to date for erased instructions and for the
/// registers they referenced.
/// \returns true if at least one instruction was erased.
106 19561 : bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
107 19561 : if (skipFunction(MF.getFunction()))
108 : return false;
109 :
110 19558 : const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
111 19558 : const SIRegisterInfo *TRI = ST.getRegisterInfo();
112 19558 : const SIInstrInfo *TII = ST.getInstrInfo();
113 19558 : MachineRegisterInfo &MRI = MF.getRegInfo();
114 19558 : LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
// Registers whose liveness information is invalidated once the rewrites are
// done (see the loop at the end). EXEC_LO / EXEC_HI are always included.
115 19558 : DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
116 : bool Changed = false;
117 :
118 41575 : for (MachineBasicBlock &MBB : MF) {
119 :
120 : // Try to remove unneeded instructions before s_endpgm.
121 22017 : if (MBB.succ_empty()) {
122 19583 : if (MBB.empty())
123 : continue;
124 :
125 : // Skip this if the endpgm has any implicit uses, otherwise we would need
126 : // to be careful to update / remove them.
127 : MachineInstr &Term = MBB.back();
128 39106 : if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
129 16669 : Term.getNumOperands() != 0)
130 : continue;
131 :
// Worklist of blocks to scan bottom-up, starting with the S_ENDPGM block.
132 16668 : SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
133 :
134 33620 : while (!Blocks.empty()) {
135 : auto CurBB = Blocks.pop_back_val();
136 : auto I = CurBB->rbegin(), E = CurBB->rend();
// Step over a trailing unconditional branch or S_ENDPGM. A block ending in
// any other kind of branch is left untouched.
137 16952 : if (I != E) {
138 16941 : if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
139 : ++I;
140 223 : else if (I->isBranch())
141 : continue;
142 : }
143 :
// Walk backwards through the block, erasing instructions until one that must
// be kept is reached.
144 17081 : while (I != E) {
// Debug instructions are skipped, not erased.
145 : if (I->isDebugInstr()) {
146 1 : I = std::next(I);
147 : continue;
148 : }
149 :
// Stop at the first instruction with an effect that has to be preserved.
150 18994 : if (I->mayStore() || I->isBarrier() || I->isCall() ||
151 17364 : I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
152 : break;
153 :
154 : LLVM_DEBUG(dbgs()
155 : << "Removing no effect instruction: " << *I << '\n');
156 :
// Remember every register the instruction mentions so that its liveness is
// refreshed after the instruction is gone.
157 626 : for (auto &Op : I->operands()) {
158 498 : if (Op.isReg())
159 471 : RecalcRegs.insert(Op.getReg());
160 : }
161 :
// Fetch the following position before erasing *I, then continue from there.
162 128 : auto Next = std::next(I);
163 128 : LIS->RemoveMachineInstrFromMaps(*I);
164 128 : I->eraseFromParent();
165 : I = Next;
166 :
167 : Changed = true;
168 : }
169 :
// The scan stopped before reaching the top of the block, so predecessors are
// not considered.
170 16952 : if (I != E)
171 : continue;
172 :
173 : // Try to ascend predecessors.
// Only predecessors with exactly one successor are queued.
174 1312 : for (auto *Pred : CurBB->predecessors()) {
175 518 : if (Pred->succ_size() == 1)
176 284 : Blocks.push_back(Pred);
177 : }
178 : }
179 : continue;
180 : }
181 :
182 : // Try to collapse adjacent endifs.
// The block must have exactly one successor and begin with an EXEC-modifying
// S_OR_B64 (the "Lead").
183 : auto Lead = MBB.begin(), E = MBB.end();
184 2434 : if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
185 2330 : continue;
186 :
// The single successor must also directly follow this block in layout.
187 104 : const MachineBasicBlock* Succ = *MBB.succ_begin();
188 104 : if (!MBB.isLayoutSuccessor(Succ))
189 : continue;
190 :
191 67 : auto I = std::next(Lead);
192 :
// Every instruction after Lead has to be a SALU instruction that does not
// read EXEC; otherwise the loop stops early and the block is skipped.
193 142 : for ( ; I != E; ++I)
194 207 : if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
195 : break;
196 :
197 67 : if (I != E)
198 : continue;
199 :
// The successor must itself begin with an EXEC-modifying S_OR_B64 whose
// non-EXEC source is defined by a full copy of EXEC.
200 : const auto NextLead = Succ->begin();
201 16 : if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
202 5 : !getOrExecSource(*NextLead, *TII, MRI))
203 7 : continue;
204 :
205 : LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
206 :
// Collect everything needed from Lead before it is erased: the EXEC copy that
// feeds it (may be null), its non-EXEC source register, and all registers it
// mentions.
207 4 : auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
208 4 : unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
209 20 : for (auto &Op : Lead->operands()) {
210 16 : if (Op.isReg())
211 16 : RecalcRegs.insert(Op.getReg());
212 : }
213 :
214 4 : LIS->RemoveMachineInstrFromMaps(*Lead);
215 4 : Lead->eraseFromParent();
// Rebuild the interval of the saved-exec register now that Lead is gone.
// NOTE(review): getOrNonExecReg() does not check that the register is
// virtual, while createAndComputeVirtRegInterval() presumably requires one --
// confirm that a physical non-EXEC source cannot reach this point.
216 4 : if (SaveExecReg) {
217 4 : LIS->removeInterval(SaveExecReg);
218 : LIS->createAndComputeVirtRegInterval(SaveExecReg);
219 : }
220 :
221 : Changed = true;
222 :
223 : // If the only use of saved exec in the removed instruction is S_AND_B64
224 : // fold the copy now.
// getOrExecSource() only ever returns full copies of EXEC, so the
// isFullCopy() test repeats a check that has already been made.
225 4 : if (!SaveExec || !SaveExec->isFullCopy())
226 : continue;
227 :
// The fold is only performed when every non-debug use of the copied register
// is in the same block as the copy.
228 3 : unsigned SavedExec = SaveExec->getOperand(0).getReg();
229 : bool SafeToReplace = true;
230 6 : for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
231 3 : if (U.getParent() != SaveExec->getParent()) {
232 : SafeToReplace = false;
233 : break;
234 : }
235 :
236 : LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
237 : }
238 :
// Erase the copy, make the remaining users read EXEC directly, and drop the
// interval of the register that is no longer used.
239 3 : if (SafeToReplace) {
240 3 : LIS->RemoveMachineInstrFromMaps(*SaveExec);
241 3 : SaveExec->eraseFromParent();
242 3 : MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
243 3 : LIS->removeInterval(SavedExec);
244 : }
245 : }
246 :
// Refresh liveness for all recorded registers. Virtual registers get their
// interval dropped and, if still referenced anywhere, recomputed; for all
// other registers the information of each of their register units is removed.
247 19558 : if (Changed) {
248 670 : for (auto Reg : RecalcRegs) {
249 563 : if (TargetRegisterInfo::isVirtualRegister(Reg)) {
250 122 : LIS->removeInterval(Reg);
251 122 : if (!MRI.reg_empty(Reg))
252 : LIS->createAndComputeVirtRegInterval(Reg);
253 : } else {
254 1443 : for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
255 561 : LIS->removeRegUnit(*U);
256 : }
257 : }
258 : }
259 :
260 : return Changed;
261 : }
|