File: | llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp |
Warning: | line 926, column 44 Called C++ object pointer is null |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | /// \file | |||
9 | /// Finalize v8.1-m low-overhead loops by converting the associated pseudo | |||
10 | /// instructions into machine operations. | |||
11 | /// The expectation is that the loop contains three pseudo instructions: | |||
12 | /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop | |||
13 | /// form should be in the preheader, whereas the while form should be in the | |||
14 | /// preheaders only predecessor. | |||
15 | /// - t2LoopDec - placed within in the loop body. | |||
16 | /// - t2LoopEnd - the loop latch terminator. | |||
17 | /// | |||
18 | /// In addition to this, we also look for the presence of the VCTP instruction, | |||
19 | /// which determines whether we can generated the tail-predicated low-overhead | |||
20 | /// loop form. | |||
21 | /// | |||
22 | /// Assumptions and Dependencies: | |||
23 | /// Low-overhead loops are constructed and executed using a setup instruction: | |||
24 | /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. | |||
25 | /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range | |||
26 | /// but fixed polarity: WLS can only branch forwards and LE can only branch | |||
27 | /// backwards. These restrictions mean that this pass is dependent upon block | |||
28 | /// layout and block sizes, which is why it's the last pass to run. The same is | |||
29 | /// true for ConstantIslands, but this pass does not increase the size of the | |||
30 | /// basic blocks, nor does it change the CFG. Instructions are mainly removed | |||
31 | /// during the transform and pseudo instructions are replaced by real ones. In | |||
32 | /// some cases, when we have to revert to a 'normal' loop, we have to introduce | |||
33 | /// multiple instructions for a single pseudo (see RevertWhile and | |||
34 | /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd | |||
35 | /// are defined to be as large as this maximum sequence of replacement | |||
36 | /// instructions. | |||
37 | /// | |||
38 | //===----------------------------------------------------------------------===// | |||
39 | ||||
40 | #include "ARM.h" | |||
41 | #include "ARMBaseInstrInfo.h" | |||
42 | #include "ARMBaseRegisterInfo.h" | |||
43 | #include "ARMBasicBlockInfo.h" | |||
44 | #include "ARMSubtarget.h" | |||
45 | #include "llvm/ADT/SetOperations.h" | |||
46 | #include "llvm/ADT/SmallSet.h" | |||
47 | #include "llvm/CodeGen/MachineFunctionPass.h" | |||
48 | #include "llvm/CodeGen/MachineLoopInfo.h" | |||
49 | #include "llvm/CodeGen/MachineLoopUtils.h" | |||
50 | #include "llvm/CodeGen/MachineRegisterInfo.h" | |||
51 | #include "llvm/CodeGen/Passes.h" | |||
52 | #include "llvm/CodeGen/ReachingDefAnalysis.h" | |||
53 | #include "llvm/MC/MCInstrDesc.h" | |||
54 | ||||
55 | using namespace llvm; | |||
56 | ||||
57 | #define DEBUG_TYPE"arm-low-overhead-loops" "arm-low-overhead-loops" | |||
58 | #define ARM_LOW_OVERHEAD_LOOPS_NAME"ARM Low Overhead Loops pass" "ARM Low Overhead Loops pass" | |||
59 | ||||
60 | namespace { | |||
61 | ||||
62 | struct PredicatedMI { | |||
63 | MachineInstr *MI = nullptr; | |||
64 | SetVector<MachineInstr*> Predicates; | |||
65 | ||||
66 | public: | |||
67 | PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) : | |||
68 | MI(I) { | |||
69 | Predicates.insert(Preds.begin(), Preds.end()); | |||
70 | } | |||
71 | }; | |||
72 | ||||
// Represent a VPT block, a list of instructions that begins with a VPST and
// has a maximum of four proceeding instructions. All instructions within the
// block are predicated upon the vpr and we allow instructions to define the
// vpr within in the block too.
class VPTBlock {
  // The VPST that opens the block, paired with the predicates that were
  // live when the block was created.
  std::unique_ptr<PredicatedMI> VPST;
  // The recorded instruction after which the predicate set diverges from
  // the VPST's entry predicates, or null while all predicates match.
  PredicatedMI *Divergent = nullptr;
  // Instructions of the block, excluding the VPST itself (at most four).
  SmallVector<PredicatedMI, 4> Insts;

public:
  VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
    VPST = std::make_unique<PredicatedMI>(MI, Preds);
  }

  // Record MI, predicated on Preds, as the next instruction of this block.
  void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
    if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
      // The incoming predicate set differs from the block's entry set, so
      // the previously recorded instruction (Insts.back()) must have
      // redefined vpr; remember it as the point of divergence.
      // NOTE(review): this assumes Insts is non-empty when divergence is
      // first seen, and relies on the SmallVector's inline storage (<= 4
      // elements, see the assert below) so the pointer is not invalidated
      // by the emplace_back — confirm both invariants hold for all callers.
      Divergent = &Insts.back();
      LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
    }
    Insts.emplace_back(MI, Preds);
    assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
  }

  // Have we found an instruction within the block which defines the vpr? If
  // so, not all the instructions in the block will have the same predicate.
  bool HasNonUniformPredicate() const {
    return Divergent != nullptr;
  }

  // Is the given instruction part of the predicate set controlling the entry
  // to the block.
  bool IsPredicatedOn(MachineInstr *MI) const {
    return VPST->Predicates.count(MI);
  }

  // Is the given instruction the only predicate which controls the entry to
  // the block.
  bool IsOnlyPredicatedOn(MachineInstr *MI) const {
    return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
  }

  unsigned size() const { return Insts.size(); }
  SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
  MachineInstr *getVPST() const { return VPST->MI; }
  PredicatedMI *getDivergent() const { return Divergent; }
};
120 | ||||
121 | struct LowOverheadLoop { | |||
122 | ||||
123 | MachineLoop *ML = nullptr; | |||
124 | MachineFunction *MF = nullptr; | |||
125 | MachineInstr *InsertPt = nullptr; | |||
126 | MachineInstr *Start = nullptr; | |||
127 | MachineInstr *Dec = nullptr; | |||
128 | MachineInstr *End = nullptr; | |||
129 | MachineInstr *VCTP = nullptr; | |||
130 | VPTBlock *CurrentBlock = nullptr; | |||
131 | SetVector<MachineInstr*> CurrentPredicate; | |||
132 | SmallVector<VPTBlock, 4> VPTBlocks; | |||
133 | bool Revert = false; | |||
134 | bool CannotTailPredicate = false; | |||
135 | ||||
136 | LowOverheadLoop(MachineLoop *ML) : ML(ML) { | |||
137 | MF = ML->getHeader()->getParent(); | |||
138 | } | |||
139 | ||||
140 | bool RecordVPTBlocks(MachineInstr *MI); | |||
141 | ||||
142 | // If this is an MVE instruction, check that we know how to use tail | |||
143 | // predication with it. | |||
144 | void AnalyseMVEInst(MachineInstr *MI) { | |||
145 | if (CannotTailPredicate) | |||
146 | return; | |||
147 | ||||
148 | if (!RecordVPTBlocks(MI)) { | |||
149 | CannotTailPredicate = true; | |||
150 | return; | |||
151 | } | |||
152 | ||||
153 | const MCInstrDesc &MCID = MI->getDesc(); | |||
154 | uint64_t Flags = MCID.TSFlags; | |||
155 | if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) | |||
156 | return; | |||
157 | ||||
158 | if ((Flags & ARMII::ValidForTailPredication) == 0) { | |||
159 | LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Can't tail predicate: " << *MI; } } while (false); | |||
160 | CannotTailPredicate = true; | |||
161 | } | |||
162 | } | |||
163 | ||||
164 | bool IsTailPredicationLegal() const { | |||
165 | // For now, let's keep things really simple and only support a single | |||
166 | // block for tail predication. | |||
167 | return !Revert && FoundAllComponents() && VCTP && | |||
168 | !CannotTailPredicate && ML->getNumBlocks() == 1; | |||
169 | } | |||
170 | ||||
171 | bool ValidateTailPredicate(MachineInstr *StartInsertPt, | |||
172 | ReachingDefAnalysis *RDA, | |||
173 | MachineLoopInfo *MLI); | |||
174 | ||||
175 | // Is it safe to define LR with DLS/WLS? | |||
176 | // LR can be defined if it is the operand to start, because it's the same | |||
177 | // value, or if it's going to be equivalent to the operand to Start. | |||
178 | MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA); | |||
179 | ||||
180 | // Check the branch targets are within range and we satisfy our | |||
181 | // restrictions. | |||
182 | void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA, | |||
183 | MachineLoopInfo *MLI); | |||
184 | ||||
185 | bool FoundAllComponents() const { | |||
186 | return Start && Dec && End; | |||
187 | } | |||
188 | ||||
189 | SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; } | |||
190 | ||||
191 | // Return the loop iteration count, or the number of elements if we're tail | |||
192 | // predicating. | |||
193 | MachineOperand &getCount() { | |||
194 | return IsTailPredicationLegal() ? | |||
195 | VCTP->getOperand(1) : Start->getOperand(0); | |||
196 | } | |||
197 | ||||
198 | unsigned getStartOpcode() const { | |||
199 | bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; | |||
200 | if (!IsTailPredicationLegal()) | |||
201 | return IsDo ? ARM::t2DLS : ARM::t2WLS; | |||
202 | ||||
203 | return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo); | |||
204 | } | |||
205 | ||||
206 | void dump() const { | |||
207 | if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; | |||
208 | if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; | |||
209 | if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; | |||
210 | if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP; | |||
211 | if (!FoundAllComponents()) | |||
212 | dbgs() << "ARM Loops: Not a low-overhead loop.\n"; | |||
213 | else if (!(Start && Dec && End)) | |||
214 | dbgs() << "ARM Loops: Failed to find all loop components.\n"; | |||
215 | } | |||
216 | }; | |||
217 | ||||
218 | class ARMLowOverheadLoops : public MachineFunctionPass { | |||
219 | MachineFunction *MF = nullptr; | |||
220 | MachineLoopInfo *MLI = nullptr; | |||
221 | ReachingDefAnalysis *RDA = nullptr; | |||
222 | const ARMBaseInstrInfo *TII = nullptr; | |||
223 | MachineRegisterInfo *MRI = nullptr; | |||
224 | const TargetRegisterInfo *TRI = nullptr; | |||
225 | std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; | |||
226 | ||||
227 | public: | |||
228 | static char ID; | |||
229 | ||||
230 | ARMLowOverheadLoops() : MachineFunctionPass(ID) { } | |||
231 | ||||
232 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
233 | AU.setPreservesCFG(); | |||
234 | AU.addRequired<MachineLoopInfo>(); | |||
235 | AU.addRequired<ReachingDefAnalysis>(); | |||
236 | MachineFunctionPass::getAnalysisUsage(AU); | |||
237 | } | |||
238 | ||||
239 | bool runOnMachineFunction(MachineFunction &MF) override; | |||
240 | ||||
241 | MachineFunctionProperties getRequiredProperties() const override { | |||
242 | return MachineFunctionProperties().set( | |||
243 | MachineFunctionProperties::Property::NoVRegs).set( | |||
244 | MachineFunctionProperties::Property::TracksLiveness); | |||
245 | } | |||
246 | ||||
247 | StringRef getPassName() const override { | |||
248 | return ARM_LOW_OVERHEAD_LOOPS_NAME"ARM Low Overhead Loops pass"; | |||
249 | } | |||
250 | ||||
251 | private: | |||
252 | bool ProcessLoop(MachineLoop *ML); | |||
253 | ||||
254 | bool RevertNonLoops(); | |||
255 | ||||
256 | void RevertWhile(MachineInstr *MI) const; | |||
257 | ||||
258 | bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const; | |||
259 | ||||
260 | void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; | |||
261 | ||||
262 | void RemoveLoopUpdate(LowOverheadLoop &LoLoop); | |||
263 | ||||
264 | void ConvertVPTBlocks(LowOverheadLoop &LoLoop); | |||
265 | ||||
266 | MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); | |||
267 | ||||
268 | void Expand(LowOverheadLoop &LoLoop); | |||
269 | ||||
270 | }; | |||
271 | } | |||
272 | ||||
273 | char ARMLowOverheadLoops::ID = 0; | |||
274 | ||||
275 | INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,static void *initializeARMLowOverheadLoopsPassOnce(PassRegistry &Registry) { PassInfo *PI = new PassInfo( "ARM Low Overhead Loops pass" , "arm-low-overhead-loops", &ARMLowOverheadLoops::ID, PassInfo ::NormalCtor_t(callDefaultCtor<ARMLowOverheadLoops>), false , false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeARMLowOverheadLoopsPassFlag; void llvm ::initializeARMLowOverheadLoopsPass(PassRegistry &Registry ) { llvm::call_once(InitializeARMLowOverheadLoopsPassFlag, initializeARMLowOverheadLoopsPassOnce , std::ref(Registry)); } | |||
276 | false, false)static void *initializeARMLowOverheadLoopsPassOnce(PassRegistry &Registry) { PassInfo *PI = new PassInfo( "ARM Low Overhead Loops pass" , "arm-low-overhead-loops", &ARMLowOverheadLoops::ID, PassInfo ::NormalCtor_t(callDefaultCtor<ARMLowOverheadLoops>), false , false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeARMLowOverheadLoopsPassFlag; void llvm ::initializeARMLowOverheadLoopsPass(PassRegistry &Registry ) { llvm::call_once(InitializeARMLowOverheadLoopsPassFlag, initializeARMLowOverheadLoopsPassOnce , std::ref(Registry)); } | |||
277 | ||||
278 | MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) { | |||
279 | // We can define LR because LR already contains the same value. | |||
280 | if (Start->getOperand(0).getReg() == ARM::LR) | |||
281 | return Start; | |||
282 | ||||
283 | unsigned CountReg = Start->getOperand(0).getReg(); | |||
284 | auto IsMoveLR = [&CountReg](MachineInstr *MI) { | |||
285 | return MI->getOpcode() == ARM::tMOVr && | |||
286 | MI->getOperand(0).getReg() == ARM::LR && | |||
287 | MI->getOperand(1).getReg() == CountReg && | |||
288 | MI->getOperand(2).getImm() == ARMCC::AL; | |||
289 | }; | |||
290 | ||||
291 | MachineBasicBlock *MBB = Start->getParent(); | |||
292 | ||||
293 | // Find an insertion point: | |||
294 | // - Is there a (mov lr, Count) before Start? If so, and nothing else writes | |||
295 | // to Count before Start, we can insert at that mov. | |||
296 | if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR)) | |||
297 | if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) | |||
298 | return LRDef; | |||
299 | ||||
300 | // - Is there a (mov lr, Count) after Start? If so, and nothing else writes | |||
301 | // to Count after Start, we can insert at that mov. | |||
302 | if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR)) | |||
303 | if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) | |||
304 | return LRDef; | |||
305 | ||||
306 | // We've found no suitable LR def and Start doesn't use LR directly. Can we | |||
307 | // just define LR anyway? | |||
308 | if (!RDA->isRegUsedAfter(Start, ARM::LR)) | |||
309 | return Start; | |||
310 | ||||
311 | return nullptr; | |||
312 | } | |||
313 | ||||
314 | // Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must | |||
315 | // not define a register that is used by any instructions, after and including, | |||
316 | // 'To'. These instructions also must not redefine any of Froms operands. | |||
317 | template<typename Iterator> | |||
318 | static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) { | |||
319 | SmallSet<int, 2> Defs; | |||
320 | // First check that From would compute the same value if moved. | |||
321 | for (auto &MO : From->operands()) { | |||
322 | if (!MO.isReg() || MO.isUndef() || !MO.getReg()) | |||
323 | continue; | |||
324 | if (MO.isDef()) | |||
325 | Defs.insert(MO.getReg()); | |||
326 | else if (!RDA->hasSameReachingDef(From, To, MO.getReg())) | |||
327 | return false; | |||
328 | } | |||
329 | ||||
330 | // Now walk checking that the rest of the instructions will compute the same | |||
331 | // value. | |||
332 | for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) { | |||
333 | for (auto &MO : I->operands()) | |||
334 | if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg())) | |||
335 | return false; | |||
336 | } | |||
337 | return true; | |||
338 | } | |||
339 | ||||
// Check that every predicated instruction in the loop is predicated (directly
// or transitively) on the VCTP, and that the element-count register is
// available at the chosen insertion point — the two conditions for emitting
// the DLSTP/WLSTP tail-predicated form.
// NOTE(review): the StartInsertPt parameter is never read; the body uses the
// InsertPt member instead (CheckLegality passes the same value) — confirm
// whether the parameter can be removed or should be used.
bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
    ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) {
  assert(VCTP && "VCTP instruction expected but is not set");
  // All predication within the loop should be based on vctp. If the block
  // isn't predicated on entry, check whether the vctp is within the block
  // and that all other instructions are then predicated on it.
  for (auto &Block : VPTBlocks) {
    if (Block.IsPredicatedOn(VCTP))
      continue;
    // Not entered under the vctp: acceptable only if the block diverges at
    // a vctp defined inside it...
    if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI))
      return false;
    // ...and every other instruction in the block is predicated on that
    // vctp (or is the vctp itself).
    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
    for (auto &PredMI : Insts) {
      if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
        continue;
      LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
                 << " - which is predicated on:\n";
                 for (auto *MI : PredMI.Predicates)
                   dbgs() << " - " << *MI;);
      return false;
    }
  }

  // For tail predication, we need to provide the number of elements, instead
  // of the iteration count, to the loop start instruction. The number of
  // elements is provided to the vctp instruction, so we need to check that
  // we can use this register at InsertPt.
  Register NumElements = VCTP->getOperand(1).getReg();

  // If the register is defined within loop, then we can't perform TP.
  // TODO: Check whether this is just a mov of a register that would be
  // available.
  if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
    LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
    return false;
  }

  // The element count register maybe defined after InsertPt, in which case we
  // need to try to move either InsertPt or the def so that the [w|d]lstp can
  // use the value.
  MachineBasicBlock *InsertBB = InsertPt->getParent();
  if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
    if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
      // Prefer moving the element-count def up before InsertPt; otherwise
      // try moving InsertPt down past the def. Either way the instruction
      // is physically re-linked in the block.
      if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
        ElemDef->removeFromParent();
        InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
        LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
                   << *ElemDef);
      } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
        InsertPt->removeFromParent();
        InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
        LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
      } else
        return false;
    }
  }

  // Especially in the case of while loops, InsertBB may not be the
  // preheader, so we need to check that the register isn't redefined
  // before entering the loop.
  auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
                                      Register NumElements) {
    // NumElements is redefined in this block.
    if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
      return true;

    // Don't continue searching up through multiple predecessors.
    if (MBB->pred_size() > 1)
      return true;

    return false;
  };

  // First, find the block that looks like the preheader.
  MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
  if (!MBB)
    return false;

  // Then search backwards for a def, until we get to InsertBB.
  // (The pred_size() check in the lambda guarantees *pred_begin() below is
  // the unique predecessor.)
  while (MBB != InsertBB) {
    if (CannotProvideElements(MBB, NumElements))
      return false;
    MBB = *MBB->pred_begin();
  }

  LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
  return true;
}
429 | ||||
430 | void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, | |||
431 | ReachingDefAnalysis *RDA, | |||
432 | MachineLoopInfo *MLI) { | |||
433 | if (Revert) | |||
434 | return; | |||
435 | ||||
436 | if (!End->getOperand(1).isMBB()) | |||
437 | report_fatal_error("Expected LoopEnd to target basic block"); | |||
438 | ||||
439 | // TODO Maybe there's cases where the target doesn't have to be the header, | |||
440 | // but for now be safe and revert. | |||
441 | if (End->getOperand(1).getMBB() != ML->getHeader()) { | |||
442 | LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: LoopEnd is not targetting header.\n" ; } } while (false); | |||
443 | Revert = true; | |||
444 | return; | |||
445 | } | |||
446 | ||||
447 | // The WLS and LE instructions have 12-bits for the label offset. WLS | |||
448 | // requires a positive offset, while LE uses negative. | |||
449 | if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || | |||
450 | !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { | |||
451 | LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: LE offset is out-of-range\n" ; } } while (false); | |||
452 | Revert = true; | |||
453 | return; | |||
454 | } | |||
455 | ||||
456 | if (Start->getOpcode() == ARM::t2WhileLoopStart && | |||
457 | (BBUtils->getOffsetOf(Start) > | |||
458 | BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || | |||
459 | !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { | |||
460 | LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: WLS offset is out-of-range!\n" ; } } while (false); | |||
461 | Revert = true; | |||
462 | return; | |||
463 | } | |||
464 | ||||
465 | InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA); | |||
466 | if (!InsertPt) { | |||
467 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Unable to find safe insertion point.\n" ; } } while (false); | |||
468 | Revert = true; | |||
469 | return; | |||
470 | } else | |||
471 | LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Start insertion point: " << *InsertPt; } } while (false); | |||
472 | ||||
473 | if (!IsTailPredicationLegal()) { | |||
474 | LLVM_DEBUG(if (!VCTP)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { if (!VCTP) dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; dbgs() << "ARM Loops: Tail-predication is not valid.\n" ; } } while (false) | |||
475 | dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { if (!VCTP) dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; dbgs() << "ARM Loops: Tail-predication is not valid.\n" ; } } while (false) | |||
476 | dbgs() << "ARM Loops: Tail-predication is not valid.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { if (!VCTP) dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; dbgs() << "ARM Loops: Tail-predication is not valid.\n" ; } } while (false); | |||
477 | return; | |||
478 | } | |||
479 | ||||
480 | assert(ML->getBlocks().size() == 1 &&((ML->getBlocks().size() == 1 && "Shouldn't be processing a loop with more than one block" ) ? static_cast<void> (0) : __assert_fail ("ML->getBlocks().size() == 1 && \"Shouldn't be processing a loop with more than one block\"" , "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp" , 481, __PRETTY_FUNCTION__)) | |||
481 | "Shouldn't be processing a loop with more than one block")((ML->getBlocks().size() == 1 && "Shouldn't be processing a loop with more than one block" ) ? static_cast<void> (0) : __assert_fail ("ML->getBlocks().size() == 1 && \"Shouldn't be processing a loop with more than one block\"" , "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp" , 481, __PRETTY_FUNCTION__)); | |||
482 | CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI); | |||
483 | LLVM_DEBUG(if (CannotTailPredicate)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { if (CannotTailPredicate) dbgs() << "ARM Loops: Couldn't validate tail predicate.\n"; } } while (false) | |||
484 | dbgs() << "ARM Loops: Couldn't validate tail predicate.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { if (CannotTailPredicate) dbgs() << "ARM Loops: Couldn't validate tail predicate.\n"; } } while (false); | |||
485 | } | |||
486 | ||||
487 | bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) { | |||
488 | // Only support a single vctp. | |||
489 | if (isVCTP(MI) && VCTP) | |||
490 | return false; | |||
491 | ||||
492 | // Start a new vpt block when we discover a vpt. | |||
493 | if (MI->getOpcode() == ARM::MVE_VPST) { | |||
494 | VPTBlocks.emplace_back(MI, CurrentPredicate); | |||
495 | CurrentBlock = &VPTBlocks.back(); | |||
496 | return true; | |||
497 | } | |||
498 | ||||
499 | if (isVCTP(MI)) | |||
500 | VCTP = MI; | |||
501 | ||||
502 | unsigned VPROpNum = MI->getNumOperands() - 1; | |||
503 | bool IsUse = false; | |||
504 | if (MI->getOperand(VPROpNum).isReg() && | |||
505 | MI->getOperand(VPROpNum).getReg() == ARM::VPR && | |||
506 | MI->getOperand(VPROpNum).isUse()) { | |||
507 | // If this instruction is predicated by VPR, it will be its last | |||
508 | // operand. Also check that it's only 'Then' predicated. | |||
509 | if (!MI->getOperand(VPROpNum-1).isImm() || | |||
510 | MI->getOperand(VPROpNum-1).getImm() != ARMVCC::Then) { | |||
511 | LLVM_DEBUG(dbgs() << "ARM Loops: Found unhandled predicate on: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Found unhandled predicate on: " << *MI; } } while (false) | |||
512 | << *MI)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Found unhandled predicate on: " << *MI; } } while (false); | |||
513 | return false; | |||
514 | } | |||
515 | CurrentBlock->addInst(MI, CurrentPredicate); | |||
516 | IsUse = true; | |||
517 | } | |||
518 | ||||
519 | bool IsDef = false; | |||
520 | for (unsigned i = 0; i < MI->getNumOperands() - 1; ++i) { | |||
521 | const MachineOperand &MO = MI->getOperand(i); | |||
522 | if (!MO.isReg() || MO.getReg() != ARM::VPR) | |||
523 | continue; | |||
524 | ||||
525 | if (MO.isDef()) { | |||
526 | CurrentPredicate.insert(MI); | |||
527 | IsDef = true; | |||
528 | } else { | |||
529 | LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Found instruction using vpr: " << *MI; } } while (false); | |||
530 | return false; | |||
531 | } | |||
532 | } | |||
533 | ||||
534 | // If we find a vpr def that is not already predicated on the vctp, we've | |||
535 | // got disjoint predicates that may not be equivalent when we do the | |||
536 | // conversion. | |||
537 | if (IsDef && !IsUse && VCTP && !isVCTP(MI)) { | |||
538 | LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI; } } while (false); | |||
539 | return false; | |||
540 | } | |||
541 | ||||
542 | return true; | |||
543 | } | |||
544 | ||||
545 | bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { | |||
546 | const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); | |||
547 | if (!ST.hasLOB()) | |||
548 | return false; | |||
549 | ||||
550 | MF = &mf; | |||
551 | LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"; } } while (false); | |||
552 | ||||
553 | MLI = &getAnalysis<MachineLoopInfo>(); | |||
554 | RDA = &getAnalysis<ReachingDefAnalysis>(); | |||
555 | MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); | |||
556 | MRI = &MF->getRegInfo(); | |||
557 | TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); | |||
558 | TRI = ST.getRegisterInfo(); | |||
559 | BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); | |||
560 | BBUtils->computeAllBlockSizes(); | |||
561 | BBUtils->adjustBBOffsetsAfter(&MF->front()); | |||
562 | ||||
563 | bool Changed = false; | |||
564 | for (auto ML : *MLI) { | |||
565 | if (!ML->getParentLoop()) | |||
566 | Changed |= ProcessLoop(ML); | |||
567 | } | |||
568 | Changed |= RevertNonLoops(); | |||
569 | return Changed; | |||
570 | } | |||
571 | ||||
/// Analyse a single loop (recursing into its children first): locate the
/// t2*LoopStart / t2LoopDec / t2LoopEnd pseudos and any VCTP, decide whether
/// the loop must be reverted to a conventional compare-and-branch form, and
/// then expand or revert it via Expand().
///
/// \returns true if this loop (or any nested loop) was modified.
bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {

  bool Changed = false;

  // Process inner loops first.
  for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
    Changed |= ProcessLoop(*I);

  LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
             if (auto *Preheader = ML->getLoopPreheader())
               dbgs() << " - " << Preheader->getName() << "\n";
             else if (auto *Preheader = MLI->findLoopPreheader(ML))
               dbgs() << " - " << Preheader->getName() << "\n";
             for (auto *MBB : ML->getBlocks())
               dbgs() << " - " << MBB->getName() << "\n";
            );

  // Search the given block for a loop start instruction. If one isn't found,
  // and there's only one predecessor block, search that one too.
  std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
    [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
    for (auto &MI : *MBB) {
      if (isLoopStart(MI))
        return &MI;
    }
    // A while-form start may live in the preheader's sole predecessor, so
    // follow single-predecessor chains backwards.
    if (MBB->pred_size() == 1)
      return SearchForStart(*MBB->pred_begin());
    return nullptr;
  };

  LowOverheadLoop LoLoop(ML);
  // Search the preheader for the start intrinsic.
  // FIXME: I don't see why we shouldn't be supporting multiple predecessors
  // with potentially multiple set.loop.iterations, so we need to enable this.
  if (auto *Preheader = ML->getLoopPreheader())
    LoLoop.Start = SearchForStart(Preheader);
  else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
    LoLoop.Start = SearchForStart(Preheader);
  else
    // Without any kind of preheader there is nowhere for the start
    // instruction to live, so there is nothing to do.
    return false;

  // Find the low-overhead loop components and decide whether or not to fall
  // back to a normal loop. Also look for a vctp instructions and decide
  // whether we can convert that predicate using tail predication.
  for (auto *MBB : reverse(ML->getBlocks())) {
    for (auto &MI : *MBB) {
      if (MI.getOpcode() == ARM::t2LoopDec)
        LoLoop.Dec = &MI;
      else if (MI.getOpcode() == ARM::t2LoopEnd)
        LoLoop.End = &MI;
      else if (isLoopStart(MI))
        LoLoop.Start = &MI;
      else if (MI.getDesc().isCall()) {
        // TODO: Though the call will require LE to execute again, does this
        // mean we should revert? Always executing LE hopefully should be
        // faster than performing a sub,cmp,br or even subs,br.
        LoLoop.Revert = true;
        LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
      } else {
        // Record VPR defs and build up their corresponding vpt blocks.
        // Check we know how to tail predicate any mve instructions.
        LoLoop.AnalyseMVEInst(&MI);
      }

      // We need to ensure that LR is not used or defined inbetween LoopDec and
      // LoopEnd.
      if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
        continue;

      // If we find that LR has been written or read between LoopDec and
      // LoopEnd, expect that the decremented value is being used else where.
      // Because this value isn't actually going to be produced until the
      // latch, by LE, we would need to generate a real sub. The value is also
      // likely to be copied/reloaded for use of LoopEnd - in which in case
      // we'd need to perform an add because it gets subtracted again by LE!
      // The other option is to then generate the other form of LE which doesn't
      // perform the sub.
      for (auto &MO : MI.operands()) {
        if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
            MO.getReg() == ARM::LR) {
          LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
          LoLoop.Revert = true;
          break;
        }
      }
    }
  }

  LLVM_DEBUG(LoLoop.dump());
  if (!LoLoop.FoundAllComponents()) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
    return false;
  }

  // Even when reverting, Expand() performs the rewrite, so reaching here
  // always means the function changed.
  LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
  Expand(LoLoop);
  return true;
}
670 | ||||
671 | // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a | |||
672 | // beq that branches to the exit branch. | |||
673 | // TODO: We could also try to generate a cbz if the value in LR is also in | |||
674 | // another low register. | |||
675 | void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { | |||
676 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Reverting to cmp: " << *MI; } } while (false); | |||
677 | MachineBasicBlock *MBB = MI->getParent(); | |||
678 | MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), | |||
679 | TII->get(ARM::t2CMPri)); | |||
680 | MIB.add(MI->getOperand(0)); | |||
681 | MIB.addImm(0); | |||
682 | MIB.addImm(ARMCC::AL); | |||
683 | MIB.addReg(ARM::NoRegister); | |||
684 | ||||
685 | MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); | |||
686 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? | |||
687 | ARM::tBcc : ARM::t2Bcc; | |||
688 | ||||
689 | MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); | |||
690 | MIB.add(MI->getOperand(1)); // branch target | |||
691 | MIB.addImm(ARMCC::EQ); // condition code | |||
692 | MIB.addReg(ARM::CPSR); | |||
693 | MI->eraseFromParent(); | |||
694 | } | |||
695 | ||||
/// Replace a t2LoopDec pseudo with a real sub (or subs) of LR.
///
/// \param MI       the t2LoopDec to revert.
/// \param SetFlags when true, try to emit a flag-setting t2SUBS so that the
///                 following reverted LoopEnd can skip its explicit compare.
/// \returns true if the emitted sub actually sets the flags, i.e. whether the
///          caller may omit the compare.
bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
                                        bool SetFlags) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
  MachineBasicBlock *MBB = MI->getParent();

  // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
  // Otherwise setting the flags here would clobber a live CPSR value, so fall
  // back to a plain sub.
  if (SetFlags &&
      (RDA->isRegUsedAfter(MI, ARM::CPSR) ||
       !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR)))
    SetFlags = false;

  MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                                    TII->get(ARM::t2SUBri));
  MIB.addDef(ARM::LR);
  MIB.add(MI->getOperand(1)); // count to decrement
  MIB.add(MI->getOperand(2)); // decrement amount
  MIB.addImm(ARMCC::AL);      // unpredicated
  MIB.addReg(0);

  // Operand 5 is the optional CC-out: tie it to CPSR (as a def) for the
  // flag-setting form, or leave it as noreg.
  if (SetFlags) {
    MIB.addReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  } else
    MIB.addReg(0);

  MI->eraseFromParent();
  return SetFlags;
}
724 | ||||
725 | // Generate a subs, or sub and cmp, and a branch instead of an LE. | |||
726 | void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { | |||
727 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI; } } while (false); | |||
728 | ||||
729 | MachineBasicBlock *MBB = MI->getParent(); | |||
730 | // Create cmp | |||
731 | if (!SkipCmp) { | |||
732 | MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), | |||
733 | TII->get(ARM::t2CMPri)); | |||
734 | MIB.addReg(ARM::LR); | |||
735 | MIB.addImm(0); | |||
736 | MIB.addImm(ARMCC::AL); | |||
737 | MIB.addReg(ARM::NoRegister); | |||
738 | } | |||
739 | ||||
740 | MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); | |||
741 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? | |||
742 | ARM::tBcc : ARM::t2Bcc; | |||
743 | ||||
744 | // Create bne | |||
745 | MachineInstrBuilder MIB = | |||
746 | BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); | |||
747 | MIB.add(MI->getOperand(1)); // branch target | |||
748 | MIB.addImm(ARMCC::NE); // condition code | |||
749 | MIB.addReg(ARM::CPSR); | |||
750 | MI->eraseFromParent(); | |||
751 | } | |||
752 | ||||
/// Replace the t2*LoopStart pseudo with the real DLS/WLS/DLSTP/WLSTP
/// instruction at LoLoop.InsertPt. For tail-predicated loops, also walk the
/// (now dead) chain of instructions that computed the iteration count and
/// delete them.
///
/// \returns the newly built loop-start instruction.
MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
  MachineInstr *InsertPt = LoLoop.InsertPt;
  MachineInstr *Start = LoLoop.Start;
  MachineBasicBlock *MBB = InsertPt->getParent();
  bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
  unsigned Opc = LoLoop.getStartOpcode();
  MachineOperand &Count = LoLoop.getCount();

  MachineInstrBuilder MIB =
    BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));

  MIB.addDef(ARM::LR);
  MIB.add(Count);
  // The while-form also carries the exit block as operand 1.
  if (!IsDo)
    MIB.add(Start->getOperand(1));

  // When using tail-predication, try to delete the dead code that was used to
  // calculate the number of loop iterations.
  if (LoLoop.IsTailPredicationLegal()) {
    SmallVector<MachineInstr*, 4> Killed;
    SmallVector<MachineInstr*, 4> Dead;
    if (auto *Def = RDA->getReachingMIDef(Start,
                                          Start->getOperand(0).getReg())) {
      Killed.push_back(Def);

      // Worklist walk: an instruction is dead once collected; any operand it
      // killed whose def has no other uses is dead too.
      while (!Killed.empty()) {
        MachineInstr *Def = Killed.back();
        Killed.pop_back();
        Dead.push_back(Def);
        for (auto &MO : Def->operands()) {
          if (!MO.isReg() || !MO.isKill())
            continue;

          MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
          if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
            Killed.push_back(Kill);
        }
      }
      // Erase only after the walk - the worklist still queries these
      // instructions through RDA while it runs.
      for (auto *MI : Dead)
        MI->eraseFromParent();
    }
  }

  // If we're inserting at a mov lr, then remove it as it's redundant.
  if (InsertPt != Start)
    InsertPt->eraseFromParent();
  Start->eraseFromParent();
  LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
  return &*MIB;
}
803 | ||||
804 | // Goal is to optimise and clean-up these loops: | |||
805 | // | |||
806 | // vector.body: | |||
807 | // renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg | |||
808 | // renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4 | |||
809 | // .. | |||
810 | // $lr = MVE_DLSTP_32 renamable $r3 | |||
811 | // | |||
812 | // The SUB is the old update of the loop iteration count expression, which | |||
813 | // is no longer needed. This sub is removed when the element count, which is in | |||
814 | // r3 in this example, is defined by an instruction in the loop, and it has | |||
815 | // no uses. | |||
816 | // | |||
817 | void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) { | |||
818 | Register ElemCount = LoLoop.VCTP->getOperand(1).getReg(); | |||
819 | MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back(); | |||
820 | ||||
821 | LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Trying to remove loop update stmt\n" ; } } while (false); | |||
822 | ||||
823 | if (LoLoop.ML->getNumBlocks() != 1) { | |||
824 | LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Single block loop expected\n" ; } } while (false); | |||
825 | return; | |||
826 | } | |||
827 | ||||
828 | LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: ";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Analyzing elemcount in operand: " ; LoLoop.VCTP->getOperand(1).dump(); } } while (false) | |||
829 | LoLoop.VCTP->getOperand(1).dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Analyzing elemcount in operand: " ; LoLoop.VCTP->getOperand(1).dump(); } } while (false); | |||
830 | ||||
831 | // Find the definition we are interested in removing, if there is one. | |||
832 | MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount); | |||
833 | if (!Def) { | |||
834 | LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Can't find a def, nothing to do.\n" ; } } while (false); | |||
835 | return; | |||
836 | } | |||
837 | ||||
838 | // Bail if we define CPSR and it is not dead | |||
839 | if (!Def->registerDefIsDead(ARM::CPSR, TRI)) { | |||
840 | LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: CPSR is not dead\n" ; } } while (false); | |||
841 | return; | |||
842 | } | |||
843 | ||||
844 | // Bail if elemcount is used in exit blocks, i.e. if it is live-in. | |||
845 | if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) { | |||
846 | LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n" ; } } while (false); | |||
847 | return; | |||
848 | } | |||
849 | ||||
850 | // Bail if there are uses after this Def in the block. | |||
851 | SmallVector<MachineInstr*, 4> Uses; | |||
852 | RDA->getReachingLocalUses(Def, ElemCount, Uses); | |||
853 | if (Uses.size()) { | |||
854 | LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n" ; } } while (false); | |||
855 | return; | |||
856 | } | |||
857 | ||||
858 | Uses.clear(); | |||
859 | RDA->getAllInstWithUseBefore(Def, ElemCount, Uses); | |||
860 | ||||
861 | // Remove Def if there are no uses, or if the only use is the VCTP | |||
862 | // instruction. | |||
863 | if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) { | |||
864 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Removing loop update instruction: " ; Def->dump(); } } while (false) | |||
865 | Def->dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Removing loop update instruction: " ; Def->dump(); } } while (false); | |||
866 | Def->eraseFromParent(); | |||
867 | } | |||
868 | ||||
869 | LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n" ; for (auto U : Uses) U->dump(); } } while (false) | |||
870 | for (auto U : Uses) U->dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n" ; for (auto U : Uses) U->dump(); } } while (false); | |||
871 | } | |||
872 | ||||
873 | void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { | |||
874 | auto RemovePredicate = [](MachineInstr *MI) { | |||
875 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Removing predicate from: " << *MI; } } while (false); | |||
876 | unsigned OpNum = MI->getNumOperands() - 1; | |||
877 | assert(MI->getOperand(OpNum-1).getImm() == ARMVCC::Then &&((MI->getOperand(OpNum-1).getImm() == ARMVCC::Then && "Expected Then predicate!") ? static_cast<void> (0) : __assert_fail ("MI->getOperand(OpNum-1).getImm() == ARMVCC::Then && \"Expected Then predicate!\"" , "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp" , 878, __PRETTY_FUNCTION__)) | |||
878 | "Expected Then predicate!")((MI->getOperand(OpNum-1).getImm() == ARMVCC::Then && "Expected Then predicate!") ? static_cast<void> (0) : __assert_fail ("MI->getOperand(OpNum-1).getImm() == ARMVCC::Then && \"Expected Then predicate!\"" , "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp" , 878, __PRETTY_FUNCTION__)); | |||
879 | MI->getOperand(OpNum-1).setImm(ARMVCC::None); | |||
880 | MI->getOperand(OpNum).setReg(0); | |||
881 | }; | |||
882 | ||||
883 | // There are a few scenarios which we have to fix up: | |||
884 | // 1) A VPT block with is only predicated by the vctp and has no internal vpr | |||
885 | // defs. | |||
886 | // 2) A VPT block which is only predicated by the vctp but has an internal | |||
887 | // vpr def. | |||
888 | // 3) A VPT block which is predicated upon the vctp as well as another vpr | |||
889 | // def. | |||
890 | // 4) A VPT block which is not predicated upon a vctp, but contains it and | |||
891 | // all instructions within the block are predicated upon in. | |||
892 | ||||
893 | for (auto &Block : LoLoop.getVPTBlocks()) { | |||
| ||||
894 | SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); | |||
895 | if (Block.HasNonUniformPredicate()) { | |||
896 | PredicatedMI *Divergent = Block.getDivergent(); | |||
897 | if (isVCTP(Divergent->MI)) { | |||
898 | // The vctp will be removed, so the size of the vpt block needs to be | |||
899 | // modified. | |||
900 | uint64_t Size = getARMVPTBlockMask(Block.size() - 1); | |||
901 | Block.getVPST()->getOperand(0).setImm(Size); | |||
902 | LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Modified VPT block mask.\n" ; } } while (false); | |||
903 | } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { | |||
904 | // The VPT block has a non-uniform predicate but it's entry is guarded | |||
905 | // only by a vctp, which means we: | |||
906 | // - Need to remove the original vpst. | |||
907 | // - Then need to unpredicate any following instructions, until | |||
908 | // we come across the divergent vpr def. | |||
909 | // - Insert a new vpst to predicate the instruction(s) that following | |||
910 | // the divergent vpr def. | |||
911 | // TODO: We could be producing more VPT blocks than necessary and could | |||
912 | // fold the newly created one into a proceeding one. | |||
913 | for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()), | |||
914 | E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I) | |||
915 | RemovePredicate(&*I); | |||
916 | ||||
917 | unsigned Size = 0; | |||
918 | auto E = MachineBasicBlock::reverse_iterator(Divergent->MI); | |||
919 | auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI); | |||
920 | MachineInstr *InsertAt = nullptr; | |||
921 | while (I != E) { | |||
922 | InsertAt = &*I; | |||
923 | ++Size; | |||
924 | ++I; | |||
925 | } | |||
926 | MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt, | |||
| ||||
927 | InsertAt->getDebugLoc(), | |||
928 | TII->get(ARM::MVE_VPST)); | |||
929 | MIB.addImm(getARMVPTBlockMask(Size)); | |||
930 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST(); } } while (false); | |||
931 | LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Created VPST: " << *MIB; } } while (false); | |||
932 | Block.getVPST()->eraseFromParent(); | |||
933 | } | |||
934 | } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { | |||
935 | // A vpt block which is only predicated upon vctp and has no internal vpr | |||
936 | // defs: | |||
937 | // - Remove vpst. | |||
938 | // - Unpredicate the remaining instructions. | |||
939 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST(); } } while (false); | |||
940 | Block.getVPST()->eraseFromParent(); | |||
941 | for (auto &PredMI : Insts) | |||
942 | RemovePredicate(PredMI.MI); | |||
943 | } | |||
944 | } | |||
945 | ||||
946 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP; } } while (false); | |||
947 | LoLoop.VCTP->eraseFromParent(); | |||
948 | } | |||
949 | ||||
/// Perform the final rewrite of the analysed loop: either revert all pseudos
/// to ordinary compare/branch code (LoLoop.Revert), or expand them into the
/// real DLS(TP)/WLS(TP) + LE(TP) pair, cleaning up redundant branches and,
/// for tail-predicated loops, the now-implicit VCTP machinery.
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {

  // Combine the LoopDec and LoopEnd instructions into LE(TP).
  auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
    MachineInstr *End = LoLoop.End;
    MachineBasicBlock *MBB = End->getParent();
    // LETP both decrements the count and updates the predicate state; the
    // plain LE form only decrements.
    unsigned Opc = LoLoop.IsTailPredicationLegal() ?
      ARM::MVE_LETP : ARM::t2LEUpdate;
    MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
                                      TII->get(Opc));
    MIB.addDef(ARM::LR);
    MIB.add(End->getOperand(0));
    MIB.add(End->getOperand(1));
    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);

    LoLoop.End->eraseFromParent();
    LoLoop.Dec->eraseFromParent();
    return &*MIB;
  };

  // TODO: We should be able to automatically remove these branches before we
  // get here - probably by teaching analyzeBranch about the pseudo
  // instructions.
  // If there is an unconditional branch, after I, that just branches to the
  // next block, remove it.
  auto RemoveDeadBranch = [](MachineInstr *I) {
    MachineBasicBlock *BB = I->getParent();
    MachineInstr *Terminator = &BB->instr_back();
    if (Terminator->isUnconditionalBranch() && I != Terminator) {
      MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
      // Only safe when the branch target is the layout successor anyway.
      if (BB->isLayoutSuccessor(Succ)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
        Terminator->eraseFromParent();
      }
    }
  };

  if (LoLoop.Revert) {
    // While-form starts need a cmp/br; do-form starts are just deleted.
    if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
      RevertWhile(LoLoop.Start);
    else
      LoLoop.Start->eraseFromParent();
    // If the sub could set the flags, the end's compare can be skipped.
    bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true);
    RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
  } else {
    LoLoop.Start = ExpandLoopStart(LoLoop);
    RemoveDeadBranch(LoLoop.Start);
    LoLoop.End = ExpandLoopEnd(LoLoop);
    RemoveDeadBranch(LoLoop.End);
    if (LoLoop.IsTailPredicationLegal()) {
      RemoveLoopUpdate(LoLoop);
      ConvertVPTBlocks(LoLoop);
    }
  }
}
1005 | ||||
1006 | bool ARMLowOverheadLoops::RevertNonLoops() { | |||
1007 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("arm-low-overhead-loops")) { dbgs() << "ARM Loops: Reverting any remaining pseudos...\n" ; } } while (false); | |||
1008 | bool Changed = false; | |||
1009 | ||||
1010 | for (auto &MBB : *MF) { | |||
1011 | SmallVector<MachineInstr*, 4> Starts; | |||
1012 | SmallVector<MachineInstr*, 4> Decs; | |||
1013 | SmallVector<MachineInstr*, 4> Ends; | |||
1014 | ||||
1015 | for (auto &I : MBB) { | |||
1016 | if (isLoopStart(I)) | |||
1017 | Starts.push_back(&I); | |||
1018 | else if (I.getOpcode() == ARM::t2LoopDec) | |||
1019 | Decs.push_back(&I); | |||
1020 | else if (I.getOpcode() == ARM::t2LoopEnd) | |||
1021 | Ends.push_back(&I); | |||
1022 | } | |||
1023 | ||||
1024 | if (Starts.empty() && Decs.empty() && Ends.empty()) | |||
1025 | continue; | |||
1026 | ||||
1027 | Changed = true; | |||
1028 | ||||
1029 | for (auto *Start : Starts) { | |||
1030 | if (Start->getOpcode() == ARM::t2WhileLoopStart) | |||
1031 | RevertWhile(Start); | |||
1032 | else | |||
1033 | Start->eraseFromParent(); | |||
1034 | } | |||
1035 | for (auto *Dec : Decs) | |||
1036 | RevertLoopDec(Dec); | |||
1037 | ||||
1038 | for (auto *End : Ends) | |||
1039 | RevertLoopEnd(End); | |||
1040 | } | |||
1041 | return Changed; | |||
1042 | } | |||
1043 | ||||
/// Factory used by the ARM target's pass pipeline to create this pass.
FunctionPass *llvm::createARMLowOverheadLoopsPass() {
  return new ARMLowOverheadLoops();
}
1 | //===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file contains the Base ARM implementation of the TargetInstrInfo class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H |
14 | #define LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H |
15 | |
#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include <array>
#include <cassert>
#include <cstdint>
26 | |
27 | #define GET_INSTRINFO_HEADER |
28 | #include "ARMGenInstrInfo.inc" |
29 | |
30 | namespace llvm { |
31 | |
32 | class ARMBaseRegisterInfo; |
33 | class ARMSubtarget; |
34 | |
35 | class ARMBaseInstrInfo : public ARMGenInstrInfo { |
36 | const ARMSubtarget &Subtarget; |
37 | |
38 | protected: |
39 | // Can be only subclassed. |
40 | explicit ARMBaseInstrInfo(const ARMSubtarget &STI); |
41 | |
42 | void expandLoadStackGuardBase(MachineBasicBlock::iterator MI, |
43 | unsigned LoadImmOpc, unsigned LoadOpc) const; |
44 | |
45 | /// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI |
46 | /// and \p DefIdx. |
47 | /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of |
48 | /// the list is modeled as <Reg:SubReg, SubIdx>. |
49 | /// E.g., REG_SEQUENCE %1:sub1, sub0, %2, sub1 would produce |
50 | /// two elements: |
51 | /// - %1:sub1, sub0 |
52 | /// - %2<:0>, sub1 |
53 | /// |
54 | /// \returns true if it is possible to build such an input sequence |
55 | /// with the pair \p MI, \p DefIdx. False otherwise. |
56 | /// |
57 | /// \pre MI.isRegSequenceLike(). |
58 | bool getRegSequenceLikeInputs( |
59 | const MachineInstr &MI, unsigned DefIdx, |
60 | SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override; |
61 | |
62 | /// Build the equivalent inputs of a EXTRACT_SUBREG for the given \p MI |
63 | /// and \p DefIdx. |
64 | /// \p [out] InputReg of the equivalent EXTRACT_SUBREG. |
65 | /// E.g., EXTRACT_SUBREG %1:sub1, sub0, sub1 would produce: |
66 | /// - %1:sub1, sub0 |
67 | /// |
68 | /// \returns true if it is possible to build such an input sequence |
69 | /// with the pair \p MI, \p DefIdx. False otherwise. |
70 | /// |
71 | /// \pre MI.isExtractSubregLike(). |
72 | bool getExtractSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx, |
73 | RegSubRegPairAndIdx &InputReg) const override; |
74 | |
75 | /// Build the equivalent inputs of a INSERT_SUBREG for the given \p MI |
76 | /// and \p DefIdx. |
77 | /// \p [out] BaseReg and \p [out] InsertedReg contain |
78 | /// the equivalent inputs of INSERT_SUBREG. |
79 | /// E.g., INSERT_SUBREG %0:sub0, %1:sub1, sub3 would produce: |
80 | /// - BaseReg: %0:sub0 |
81 | /// - InsertedReg: %1:sub1, sub3 |
82 | /// |
83 | /// \returns true if it is possible to build such an input sequence |
84 | /// with the pair \p MI, \p DefIdx. False otherwise. |
85 | /// |
86 | /// \pre MI.isInsertSubregLike(). |
87 | bool |
88 | getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx, |
89 | RegSubRegPair &BaseReg, |
90 | RegSubRegPairAndIdx &InsertedReg) const override; |
91 | |
92 | /// Commutes the operands in the given instruction. |
93 | /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. |
94 | /// |
95 | /// Do not call this method for a non-commutable instruction or for |
96 | /// non-commutable pair of operand indices OpIdx1 and OpIdx2. |
97 | /// Even though the instruction is commutable, the method may still |
98 | /// fail to commute the operands, null pointer is returned in such cases. |
99 | MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, |
100 | unsigned OpIdx1, |
101 | unsigned OpIdx2) const override; |
102 | /// If the specific machine instruction is an instruction that moves/copies |
103 | /// value from one register to another register return destination and source |
104 | /// registers as machine operands. |
105 | Optional<DestSourcePair> |
106 | isCopyInstrImpl(const MachineInstr &MI) const override; |
107 | |
108 | public: |
109 | // Return whether the target has an explicit NOP encoding. |
110 | bool hasNOP() const; |
111 | |
112 | // Return the non-pre/post incrementing version of 'Opc'. Return 0 |
113 | // if there is not such an opcode. |
114 | virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; |
115 | |
116 | MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, |
117 | MachineInstr &MI, |
118 | LiveVariables *LV) const override; |
119 | |
120 | virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; |
121 | const ARMSubtarget &getSubtarget() const { return Subtarget; } |
122 | |
123 | ScheduleHazardRecognizer * |
124 | CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, |
125 | const ScheduleDAG *DAG) const override; |
126 | |
127 | ScheduleHazardRecognizer * |
128 | CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, |
129 | const ScheduleDAG *DAG) const override; |
130 | |
131 | // Branch analysis. |
132 | bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, |
133 | MachineBasicBlock *&FBB, |
134 | SmallVectorImpl<MachineOperand> &Cond, |
135 | bool AllowModify = false) const override; |
136 | unsigned removeBranch(MachineBasicBlock &MBB, |
137 | int *BytesRemoved = nullptr) const override; |
138 | unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, |
139 | MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, |
140 | const DebugLoc &DL, |
141 | int *BytesAdded = nullptr) const override; |
142 | |
143 | bool |
144 | reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; |
145 | |
146 | // Predication support. |
147 | bool isPredicated(const MachineInstr &MI) const override; |
148 | |
149 | ARMCC::CondCodes getPredicate(const MachineInstr &MI) const { |
150 | int PIdx = MI.findFirstPredOperandIdx(); |
151 | return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm() |
152 | : ARMCC::AL; |
153 | } |
154 | |
155 | bool PredicateInstruction(MachineInstr &MI, |
156 | ArrayRef<MachineOperand> Pred) const override; |
157 | |
158 | bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, |
159 | ArrayRef<MachineOperand> Pred2) const override; |
160 | |
161 | bool DefinesPredicate(MachineInstr &MI, |
162 | std::vector<MachineOperand> &Pred) const override; |
163 | |
164 | bool isPredicable(const MachineInstr &MI) const override; |
165 | |
166 | // CPSR defined in instruction |
167 | static bool isCPSRDefined(const MachineInstr &MI); |
168 | bool isAddrMode3OpImm(const MachineInstr &MI, unsigned Op) const; |
169 | bool isAddrMode3OpMinusReg(const MachineInstr &MI, unsigned Op) const; |
170 | |
171 | // Load, scaled register offset |
172 | bool isLdstScaledReg(const MachineInstr &MI, unsigned Op) const; |
173 | // Load, scaled register offset, not plus LSL2 |
174 | bool isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, unsigned Op) const; |
175 | // Minus reg for ldstso addr mode |
176 | bool isLdstSoMinusReg(const MachineInstr &MI, unsigned Op) const; |
177 | // Scaled register offset in address mode 2 |
178 | bool isAm2ScaledReg(const MachineInstr &MI, unsigned Op) const; |
179 | // Load multiple, base reg in list |
180 | bool isLDMBaseRegInList(const MachineInstr &MI) const; |
181 | // get LDM variable defs size |
182 | unsigned getLDMVariableDefsSize(const MachineInstr &MI) const; |
183 | |
184 | /// GetInstSize - Returns the size of the specified MachineInstr. |
185 | /// |
186 | unsigned getInstSizeInBytes(const MachineInstr &MI) const override; |
187 | |
188 | unsigned isLoadFromStackSlot(const MachineInstr &MI, |
189 | int &FrameIndex) const override; |
190 | unsigned isStoreToStackSlot(const MachineInstr &MI, |
191 | int &FrameIndex) const override; |
192 | unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI, |
193 | int &FrameIndex) const override; |
194 | unsigned isStoreToStackSlotPostFE(const MachineInstr &MI, |
195 | int &FrameIndex) const override; |
196 | |
197 | void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
198 | unsigned SrcReg, bool KillSrc, |
199 | const ARMSubtarget &Subtarget) const; |
200 | void copyFromCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
201 | unsigned DestReg, bool KillSrc, |
202 | const ARMSubtarget &Subtarget) const; |
203 | |
204 | void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
205 | const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, |
206 | bool KillSrc) const override; |
207 | |
208 | void storeRegToStackSlot(MachineBasicBlock &MBB, |
209 | MachineBasicBlock::iterator MBBI, |
210 | unsigned SrcReg, bool isKill, int FrameIndex, |
211 | const TargetRegisterClass *RC, |
212 | const TargetRegisterInfo *TRI) const override; |
213 | |
214 | void loadRegFromStackSlot(MachineBasicBlock &MBB, |
215 | MachineBasicBlock::iterator MBBI, |
216 | unsigned DestReg, int FrameIndex, |
217 | const TargetRegisterClass *RC, |
218 | const TargetRegisterInfo *TRI) const override; |
219 | |
220 | bool expandPostRAPseudo(MachineInstr &MI) const override; |
221 | |
222 | bool shouldSink(const MachineInstr &MI) const override; |
223 | |
224 | void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, |
225 | unsigned DestReg, unsigned SubIdx, |
226 | const MachineInstr &Orig, |
227 | const TargetRegisterInfo &TRI) const override; |
228 | |
229 | MachineInstr & |
230 | duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, |
231 | const MachineInstr &Orig) const override; |
232 | |
233 | const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, |
234 | unsigned SubIdx, unsigned State, |
235 | const TargetRegisterInfo *TRI) const; |
236 | |
237 | bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1, |
238 | const MachineRegisterInfo *MRI) const override; |
239 | |
240 | /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to |
241 | /// determine if two loads are loading from the same base address. It should |
242 | /// only return true if the base pointers are the same and the only |
243 | /// differences between the two addresses is the offset. It also returns the |
244 | /// offsets by reference. |
245 | bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, |
246 | int64_t &Offset2) const override; |
247 | |
248 | /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to |
249 | /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads |
250 | /// should be scheduled togther. On some targets if two loads are loading from |
251 | /// addresses in the same cache line, it's better if they are scheduled |
252 | /// together. This function takes two integers that represent the load offsets |
253 | /// from the common base address. It returns true if it decides it's desirable |
254 | /// to schedule the two loads together. "NumLoads" is the number of loads that |
255 | /// have already been scheduled after Load1. |
256 | bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, |
257 | int64_t Offset1, int64_t Offset2, |
258 | unsigned NumLoads) const override; |
259 | |
260 | bool isSchedulingBoundary(const MachineInstr &MI, |
261 | const MachineBasicBlock *MBB, |
262 | const MachineFunction &MF) const override; |
263 | |
264 | bool isProfitableToIfCvt(MachineBasicBlock &MBB, |
265 | unsigned NumCycles, unsigned ExtraPredCycles, |
266 | BranchProbability Probability) const override; |
267 | |
268 | bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, |
269 | unsigned ExtraT, MachineBasicBlock &FMBB, |
270 | unsigned NumF, unsigned ExtraF, |
271 | BranchProbability Probability) const override; |
272 | |
273 | bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, |
274 | BranchProbability Probability) const override { |
275 | return NumCycles == 1; |
276 | } |
277 | |
278 | unsigned extraSizeToPredicateInstructions(const MachineFunction &MF, |
279 | unsigned NumInsts) const override; |
280 | unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const override; |
281 | |
282 | bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, |
283 | MachineBasicBlock &FMBB) const override; |
284 | |
285 | /// analyzeCompare - For a comparison instruction, return the source registers |
286 | /// in SrcReg and SrcReg2 if having two register operands, and the value it |
287 | /// compares against in CmpValue. Return true if the comparison instruction |
288 | /// can be analyzed. |
289 | bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, |
290 | unsigned &SrcReg2, int &CmpMask, |
291 | int &CmpValue) const override; |
292 | |
293 | /// optimizeCompareInstr - Convert the instruction to set the zero flag so |
294 | /// that we can remove a "comparison with zero"; Remove a redundant CMP |
295 | /// instruction if the flags can be updated in the same way by an earlier |
296 | /// instruction such as SUB. |
297 | bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, |
298 | unsigned SrcReg2, int CmpMask, int CmpValue, |
299 | const MachineRegisterInfo *MRI) const override; |
300 | |
301 | bool analyzeSelect(const MachineInstr &MI, |
302 | SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp, |
303 | unsigned &FalseOp, bool &Optimizable) const override; |
304 | |
305 | MachineInstr *optimizeSelect(MachineInstr &MI, |
306 | SmallPtrSetImpl<MachineInstr *> &SeenMIs, |
307 | bool) const override; |
308 | |
309 | /// FoldImmediate - 'Reg' is known to be defined by a move immediate |
310 | /// instruction, try to fold the immediate into the use instruction. |
311 | bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, |
312 | MachineRegisterInfo *MRI) const override; |
313 | |
314 | unsigned getNumMicroOps(const InstrItineraryData *ItinData, |
315 | const MachineInstr &MI) const override; |
316 | |
317 | int getOperandLatency(const InstrItineraryData *ItinData, |
318 | const MachineInstr &DefMI, unsigned DefIdx, |
319 | const MachineInstr &UseMI, |
320 | unsigned UseIdx) const override; |
321 | int getOperandLatency(const InstrItineraryData *ItinData, |
322 | SDNode *DefNode, unsigned DefIdx, |
323 | SDNode *UseNode, unsigned UseIdx) const override; |
324 | |
325 | /// VFP/NEON execution domains. |
326 | std::pair<uint16_t, uint16_t> |
327 | getExecutionDomain(const MachineInstr &MI) const override; |
328 | void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override; |
329 | |
330 | unsigned |
331 | getPartialRegUpdateClearance(const MachineInstr &, unsigned, |
332 | const TargetRegisterInfo *) const override; |
333 | void breakPartialRegDependency(MachineInstr &, unsigned, |
334 | const TargetRegisterInfo *TRI) const override; |
335 | |
336 | /// Get the number of addresses by LDM or VLDM or zero for unknown. |
337 | unsigned getNumLDMAddresses(const MachineInstr &MI) const; |
338 | |
339 | std::pair<unsigned, unsigned> |
340 | decomposeMachineOperandsTargetFlags(unsigned TF) const override; |
341 | ArrayRef<std::pair<unsigned, const char *>> |
342 | getSerializableDirectMachineOperandTargetFlags() const override; |
343 | ArrayRef<std::pair<unsigned, const char *>> |
344 | getSerializableBitmaskMachineOperandTargetFlags() const override; |
345 | |
346 | private: |
347 | unsigned getInstBundleLength(const MachineInstr &MI) const; |
348 | |
349 | int getVLDMDefCycle(const InstrItineraryData *ItinData, |
350 | const MCInstrDesc &DefMCID, |
351 | unsigned DefClass, |
352 | unsigned DefIdx, unsigned DefAlign) const; |
353 | int getLDMDefCycle(const InstrItineraryData *ItinData, |
354 | const MCInstrDesc &DefMCID, |
355 | unsigned DefClass, |
356 | unsigned DefIdx, unsigned DefAlign) const; |
357 | int getVSTMUseCycle(const InstrItineraryData *ItinData, |
358 | const MCInstrDesc &UseMCID, |
359 | unsigned UseClass, |
360 | unsigned UseIdx, unsigned UseAlign) const; |
361 | int getSTMUseCycle(const InstrItineraryData *ItinData, |
362 | const MCInstrDesc &UseMCID, |
363 | unsigned UseClass, |
364 | unsigned UseIdx, unsigned UseAlign) const; |
365 | int getOperandLatency(const InstrItineraryData *ItinData, |
366 | const MCInstrDesc &DefMCID, |
367 | unsigned DefIdx, unsigned DefAlign, |
368 | const MCInstrDesc &UseMCID, |
369 | unsigned UseIdx, unsigned UseAlign) const; |
370 | |
371 | int getOperandLatencyImpl(const InstrItineraryData *ItinData, |
372 | const MachineInstr &DefMI, unsigned DefIdx, |
373 | const MCInstrDesc &DefMCID, unsigned DefAdj, |
374 | const MachineOperand &DefMO, unsigned Reg, |
375 | const MachineInstr &UseMI, unsigned UseIdx, |
376 | const MCInstrDesc &UseMCID, unsigned UseAdj) const; |
377 | |
378 | unsigned getPredicationCost(const MachineInstr &MI) const override; |
379 | |
380 | unsigned getInstrLatency(const InstrItineraryData *ItinData, |
381 | const MachineInstr &MI, |
382 | unsigned *PredCost = nullptr) const override; |
383 | |
384 | int getInstrLatency(const InstrItineraryData *ItinData, |
385 | SDNode *Node) const override; |
386 | |
387 | bool hasHighOperandLatency(const TargetSchedModel &SchedModel, |
388 | const MachineRegisterInfo *MRI, |
389 | const MachineInstr &DefMI, unsigned DefIdx, |
390 | const MachineInstr &UseMI, |
391 | unsigned UseIdx) const override; |
392 | bool hasLowDefLatency(const TargetSchedModel &SchedModel, |
393 | const MachineInstr &DefMI, |
394 | unsigned DefIdx) const override; |
395 | |
396 | /// verifyInstruction - Perform target specific instruction verification. |
397 | bool verifyInstruction(const MachineInstr &MI, |
398 | StringRef &ErrInfo) const override; |
399 | |
400 | virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI) const = 0; |
401 | |
402 | void expandMEMCPY(MachineBasicBlock::iterator) const; |
403 | |
404 | /// Identify instructions that can be folded into a MOVCC instruction, and |
405 | /// return the defining instruction. |
406 | MachineInstr *canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, |
407 | const TargetInstrInfo *TII) const; |
408 | |
409 | private: |
410 | /// Modeling special VFP / NEON fp MLA / MLS hazards. |
411 | |
412 | /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal |
413 | /// MLx table. |
414 | DenseMap<unsigned, unsigned> MLxEntryMap; |
415 | |
416 | /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause |
417 | /// stalls when scheduled together with fp MLA / MLS opcodes. |
418 | SmallSet<unsigned, 16> MLxHazardOpcodes; |
419 | |
420 | public: |
421 | /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS |
422 | /// instruction. |
423 | bool isFpMLxInstruction(unsigned Opcode) const { |
424 | return MLxEntryMap.count(Opcode); |
425 | } |
426 | |
427 | /// isFpMLxInstruction - This version also returns the multiply opcode and the |
428 | /// addition / subtraction opcode to expand to. Return true for 'HasLane' for |
429 | /// the MLX instructions with an extra lane operand. |
430 | bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, |
431 | unsigned &AddSubOpc, bool &NegAcc, |
432 | bool &HasLane) const; |
433 | |
434 | /// canCauseFpMLxStall - Return true if an instruction of the specified opcode |
435 | /// will cause stalls when scheduled after (within 4-cycle window) a fp |
436 | /// MLA / MLS instruction. |
437 | bool canCauseFpMLxStall(unsigned Opcode) const { |
438 | return MLxHazardOpcodes.count(Opcode); |
439 | } |
440 | |
441 | /// Returns true if the instruction has a shift by immediate that can be |
442 | /// executed in one cycle less. |
443 | bool isSwiftFastImmShift(const MachineInstr *MI) const; |
444 | |
445 | /// Returns predicate register associated with the given frame instruction. |
446 | unsigned getFramePred(const MachineInstr &MI) const { |
447 | assert(isFrameInstr(MI))((isFrameInstr(MI)) ? static_cast<void> (0) : __assert_fail ("isFrameInstr(MI)", "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/ARM/ARMBaseInstrInfo.h" , 447, __PRETTY_FUNCTION__)); |
448 | // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP: |
449 | // - argument declared in the pattern: |
450 | // 0 - frame size |
451 | // 1 - arg of CALLSEQ_START/CALLSEQ_END |
452 | // 2 - predicate code (like ARMCC::AL) |
453 | // - added by predOps: |
454 | // 3 - predicate reg |
455 | return MI.getOperand(3).getReg(); |
456 | } |
457 | |
458 | Optional<RegImmPair> isAddImmediate(const MachineInstr &MI, |
459 | Register Reg) const override; |
460 | }; |
461 | |
462 | /// Get the operands corresponding to the given \p Pred value. By default, the |
463 | /// predicate register is assumed to be 0 (no register), but you can pass in a |
464 | /// \p PredReg if that is not the case. |
465 | static inline std::array<MachineOperand, 2> predOps(ARMCC::CondCodes Pred, |
466 | unsigned PredReg = 0) { |
467 | return {{MachineOperand::CreateImm(static_cast<int64_t>(Pred)), |
468 | MachineOperand::CreateReg(PredReg, false)}}; |
469 | } |
470 | |
471 | /// Get the operand corresponding to the conditional code result. By default, |
472 | /// this is 0 (no register). |
473 | static inline MachineOperand condCodeOp(unsigned CCReg = 0) { |
474 | return MachineOperand::CreateReg(CCReg, false); |
475 | } |
476 | |
477 | /// Get the operand corresponding to the conditional code result for Thumb1. |
478 | /// This operand will always refer to CPSR and it will have the Define flag set. |
479 | /// You can optionally set the Dead flag by means of \p isDead. |
480 | static inline MachineOperand t1CondCodeOp(bool isDead = false) { |
481 | return MachineOperand::CreateReg(ARM::CPSR, |
482 | /*Define*/ true, /*Implicit*/ false, |
483 | /*Kill*/ false, isDead); |
484 | } |
485 | |
486 | static inline |
487 | bool isUncondBranchOpcode(int Opc) { |
488 | return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B; |
489 | } |
490 | |
// This table shows the VPT instruction variants, i.e. the different
// mask field encodings, see also B5.6. Predication/conditional execution in
// the ArmARM.
// Each letter after the leading T names the then (T) / else (E) status of one
// subsequent instruction in the VPT block; the numeric value is the 4-bit
// mask field encoding for that pattern.
enum VPTMaskValue {
  T    = 8,  // 0b1000
  TT   = 4,  // 0b0100
  TE   = 12, // 0b1100
  TTT  = 2,  // 0b0010
  TTE  = 6,  // 0b0110
  TEE  = 10, // 0b1010
  TET  = 14, // 0b1110
  TTTT = 1,  // 0b0001
  TTTE = 3,  // 0b0011
  TTEE = 5,  // 0b0101
  TTET = 7,  // 0b0111
  TEEE = 9,  // 0b1001
  TEET = 11, // 0b1011
  TETT = 13, // 0b1101
  TETE = 15  // 0b1111
};
511 | |
512 | static inline bool isVPTOpcode(int Opc) { |
513 | return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 || |
514 | Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 || |
515 | Opc == ARM::MVE_VPTv8u16 || Opc == ARM::MVE_VPTv8s16 || |
516 | Opc == ARM::MVE_VPTv4i32 || Opc == ARM::MVE_VPTv4u32 || |
517 | Opc == ARM::MVE_VPTv4s32 || Opc == ARM::MVE_VPTv4f32 || |
518 | Opc == ARM::MVE_VPTv8f16 || Opc == ARM::MVE_VPTv16i8r || |
519 | Opc == ARM::MVE_VPTv16u8r || Opc == ARM::MVE_VPTv16s8r || |
520 | Opc == ARM::MVE_VPTv8i16r || Opc == ARM::MVE_VPTv8u16r || |
521 | Opc == ARM::MVE_VPTv8s16r || Opc == ARM::MVE_VPTv4i32r || |
522 | Opc == ARM::MVE_VPTv4u32r || Opc == ARM::MVE_VPTv4s32r || |
523 | Opc == ARM::MVE_VPTv4f32r || Opc == ARM::MVE_VPTv8f16r || |
524 | Opc == ARM::MVE_VPST; |
525 | } |
526 | |
/// Map an MVE VCMP opcode onto the VPT opcode with the same type/size suffix
/// (the 'r' register-operand forms map to the matching 'r' VPT forms).
/// Returns 0 when \p Opcode is not an MVE VCMP.
static inline
unsigned VCMPOpcodeToVPT(unsigned Opcode) {
  switch (Opcode) {
  default:
    return 0;
  case ARM::MVE_VCMPf32:
    return ARM::MVE_VPTv4f32;
  case ARM::MVE_VCMPf16:
    return ARM::MVE_VPTv8f16;
  case ARM::MVE_VCMPi8:
    return ARM::MVE_VPTv16i8;
  case ARM::MVE_VCMPi16:
    return ARM::MVE_VPTv8i16;
  case ARM::MVE_VCMPi32:
    return ARM::MVE_VPTv4i32;
  case ARM::MVE_VCMPu8:
    return ARM::MVE_VPTv16u8;
  case ARM::MVE_VCMPu16:
    return ARM::MVE_VPTv8u16;
  case ARM::MVE_VCMPu32:
    return ARM::MVE_VPTv4u32;
  case ARM::MVE_VCMPs8:
    return ARM::MVE_VPTv16s8;
  case ARM::MVE_VCMPs16:
    return ARM::MVE_VPTv8s16;
  case ARM::MVE_VCMPs32:
    return ARM::MVE_VPTv4s32;

  case ARM::MVE_VCMPf32r:
    return ARM::MVE_VPTv4f32r;
  case ARM::MVE_VCMPf16r:
    return ARM::MVE_VPTv8f16r;
  case ARM::MVE_VCMPi8r:
    return ARM::MVE_VPTv16i8r;
  case ARM::MVE_VCMPi16r:
    return ARM::MVE_VPTv8i16r;
  case ARM::MVE_VCMPi32r:
    return ARM::MVE_VPTv4i32r;
  case ARM::MVE_VCMPu8r:
    return ARM::MVE_VPTv16u8r;
  case ARM::MVE_VCMPu16r:
    return ARM::MVE_VPTv8u16r;
  case ARM::MVE_VCMPu32r:
    return ARM::MVE_VPTv4u32r;
  case ARM::MVE_VCMPs8r:
    return ARM::MVE_VPTv16s8r;
  case ARM::MVE_VCMPs16r:
    return ARM::MVE_VPTv8s16r;
  case ARM::MVE_VCMPs32r:
    return ARM::MVE_VPTv4s32r;
  }
}
579 | |
580 | static inline |
581 | unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) { |
582 | switch (Opcode) { |
583 | default: |
584 | llvm_unreachable("unhandled vctp opcode")::llvm::llvm_unreachable_internal("unhandled vctp opcode", "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Target/ARM/ARMBaseInstrInfo.h" , 584); |
585 | break; |
586 | case ARM::MVE_VCTP8: |
587 | return IsDoLoop ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; |
588 | case ARM::MVE_VCTP16: |
589 | return IsDoLoop ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; |
590 | case ARM::MVE_VCTP32: |
591 | return IsDoLoop ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; |
592 | case ARM::MVE_VCTP64: |
593 | return IsDoLoop ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; |
594 | } |
595 | return 0; |
596 | } |
597 | |
598 | static inline |
599 | bool isVCTP(MachineInstr *MI) { |
600 | switch (MI->getOpcode()) { |
601 | default: |
602 | break; |
603 | case ARM::MVE_VCTP8: |
604 | case ARM::MVE_VCTP16: |
605 | case ARM::MVE_VCTP32: |
606 | case ARM::MVE_VCTP64: |
607 | return true; |
608 | } |
609 | return false; |
610 | } |
611 | |
612 | static inline |
613 | bool isLoopStart(MachineInstr &MI) { |
614 | return MI.getOpcode() == ARM::t2DoLoopStart || |
615 | MI.getOpcode() == ARM::t2WhileLoopStart; |
616 | } |
617 | |
618 | static inline |
619 | bool isCondBranchOpcode(int Opc) { |
620 | return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc; |
621 | } |
622 | |
623 | static inline bool isJumpTableBranchOpcode(int Opc) { |
624 | return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm_i12 || |
625 | Opc == ARM::BR_JTm_rs || Opc == ARM::BR_JTadd || Opc == ARM::tBR_JTr || |
626 | Opc == ARM::t2BR_JT; |
627 | } |
628 | |
629 | static inline |
630 | bool isIndirectBranchOpcode(int Opc) { |
631 | return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; |
632 | } |
633 | |
634 | static inline bool isPopOpcode(int Opc) { |
635 | return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET || |
636 | Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD || |
637 | Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD; |
638 | } |
639 | |
640 | static inline bool isPushOpcode(int Opc) { |
641 | return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD || |
642 | Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD; |
643 | } |
644 | |
645 | /// isValidCoprocessorNumber - decide whether an explicit coprocessor |
646 | /// number is legal in generic instructions like CDP. The answer can |
647 | /// vary with the subtarget. |
648 | static inline bool isValidCoprocessorNumber(unsigned Num, |
649 | const FeatureBitset& featureBits) { |
650 | // Armv8-A disallows everything *other* than 111x (CP14 and CP15). |
651 | if (featureBits[ARM::HasV8Ops] && (Num & 0xE) != 0xE) |
652 | return false; |
653 | |
654 | // Armv7 disallows 101x (CP10 and CP11), which clash with VFP/NEON. |
655 | if (featureBits[ARM::HasV7Ops] && (Num & 0xE) == 0xA) |
656 | return false; |
657 | |
658 | // Armv8.1-M also disallows 100x (CP8,CP9) and 111x (CP14,CP15) |
659 | // which clash with MVE. |
660 | if (featureBits[ARM::HasV8_1MMainlineOps] && |
661 | ((Num & 0xE) == 0x8 || (Num & 0xE) == 0xE)) |
662 | return false; |
663 | |
664 | return true; |
665 | } |
666 | |
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);

/// Map an unconditional branch opcode to its conditional counterpart.
unsigned getMatchingCondBranchOpcode(unsigned Opc);

/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
/// the instruction is encoded with an 'S' bit is determined by the optional
/// CPSR def operand.
unsigned convertAddSubFlagsOpcode(unsigned OldOpc);

/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
/// code.
void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI,
                             const DebugLoc &dl, unsigned DestReg,
                             unsigned BaseReg, int NumBytes,
                             ARMCC::CondCodes Pred, unsigned PredReg,
                             const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);

void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator &MBBI,
                            const DebugLoc &dl, unsigned DestReg,
                            unsigned BaseReg, int NumBytes,
                            ARMCC::CondCodes Pred, unsigned PredReg,
                            const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator &MBBI,
                               const DebugLoc &dl, unsigned DestReg,
                               unsigned BaseReg, int NumBytes,
                               const TargetInstrInfo &TII,
                               const ARMBaseRegisterInfo &MRI,
                               unsigned MIFlags = 0);

/// Tries to add registers to the reglist of a given base-updating
/// push/pop instruction to adjust the stack by an additional
/// NumBytes. This can save a few bytes per function in code-size, but
/// obviously generates more memory traffic. As such, it only takes
/// effect in functions being optimised for size.
bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
                                MachineFunction &MF, MachineInstr *MI,
                                unsigned NumBytes);

/// rewriteARMFrameIndex / rewriteT2FrameIndex -
/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
/// offset could not be handled directly in MI, and return the left-over
/// portion by reference.
bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                          unsigned FrameReg, int &Offset,
                          const ARMBaseInstrInfo &TII);

bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                         unsigned FrameReg, int &Offset,
                         const ARMBaseInstrInfo &TII,
                         const TargetRegisterInfo *TRI);

/// Return true if Reg is defined between From and To
bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From,
                            MachineBasicBlock::iterator To,
                            const TargetRegisterInfo *TRI);

/// Search backwards from a tBcc to find a tCMPi8 against 0, meaning
/// we can convert them to a tCBZ or tCBNZ. Return nullptr if not found.
MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br,
                                   const TargetRegisterInfo *TRI);

// MVE predication-operand helpers for instructions under construction.
// NOTE(review): the NOp/ROp suffixes appear to correspond to the vpred_n /
// vpred_r operand flavours (the latter carrying an inactive-lanes register)
// -- confirm against the definitions in ARMBaseInstrInfo.cpp.
void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB);
void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg);

void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
                              unsigned Inactive);

/// Returns the number of instructions required to materialize the given
/// constant in a register, or 3 if a literal pool load is needed.
/// If ForCodesize is specified, an approximate cost in bytes is returned.
unsigned ConstantMaterializationCost(unsigned Val,
                                     const ARMSubtarget *Subtarget,
                                     bool ForCodesize = false);

/// Returns true if Val1 has a lower Constant Materialization Cost than Val2.
/// Uses the cost from ConstantMaterializationCost, first with ForCodesize as
/// specified. If the scores are equal, return the comparison for !ForCodesize.
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
                                         const ARMSubtarget *Subtarget,
                                         bool ForCodesize = false);
756 | } // end namespace llvm |
757 | |
758 | #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H |