File: | lib/Transforms/Vectorize/BBVectorize.cpp |
Location: | line 2629, column 11 |
Description: | Value stored to 'I2T' is never read |
1 | //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// |
2 | // |
3 | // The LLVM Compiler Infrastructure |
4 | // |
5 | // This file is distributed under the University of Illinois Open Source |
6 | // License. See LICENSE.TXT for details. |
7 | // |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | // This file implements a basic-block vectorization pass. The algorithm was |
11 | // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, |
12 | // et al. It works by looking for chains of pairable operations and then |
13 | // pairing them. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #define BBV_NAME"bb-vectorize" "bb-vectorize" |
18 | #include "llvm/Transforms/Vectorize.h" |
19 | #include "llvm/ADT/DenseMap.h" |
20 | #include "llvm/ADT/DenseSet.h" |
21 | #include "llvm/ADT/STLExtras.h" |
22 | #include "llvm/ADT/SmallSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/ADT/Statistic.h" |
25 | #include "llvm/ADT/StringExtras.h" |
26 | #include "llvm/Analysis/AliasAnalysis.h" |
27 | #include "llvm/Analysis/AliasSetTracker.h" |
28 | #include "llvm/Analysis/ScalarEvolution.h" |
29 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
30 | #include "llvm/Analysis/TargetTransformInfo.h" |
31 | #include "llvm/Analysis/ValueTracking.h" |
32 | #include "llvm/IR/Constants.h" |
33 | #include "llvm/IR/DataLayout.h" |
34 | #include "llvm/IR/DerivedTypes.h" |
35 | #include "llvm/IR/Dominators.h" |
36 | #include "llvm/IR/Function.h" |
37 | #include "llvm/IR/Instructions.h" |
38 | #include "llvm/IR/IntrinsicInst.h" |
39 | #include "llvm/IR/Intrinsics.h" |
40 | #include "llvm/IR/LLVMContext.h" |
41 | #include "llvm/IR/Metadata.h" |
42 | #include "llvm/IR/Type.h" |
43 | #include "llvm/IR/ValueHandle.h" |
44 | #include "llvm/Pass.h" |
45 | #include "llvm/Support/CommandLine.h" |
46 | #include "llvm/Support/Debug.h" |
47 | #include "llvm/Support/raw_ostream.h" |
48 | #include "llvm/Transforms/Utils/Local.h" |
49 | #include <algorithm> |
50 | using namespace llvm; |
51 | |
52 | #define DEBUG_TYPE"bb-vectorize" BBV_NAME"bb-vectorize" |
53 | |
54 | static cl::opt<bool> |
55 | IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), |
56 | cl::Hidden, cl::desc("Ignore target information")); |
57 | |
58 | static cl::opt<unsigned> |
59 | ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, |
60 | cl::desc("The required chain depth for vectorization")); |
61 | |
62 | static cl::opt<bool> |
63 | UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), |
64 | cl::Hidden, cl::desc("Use the chain depth requirement with" |
65 | " target information")); |
66 | |
67 | static cl::opt<unsigned> |
68 | SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, |
69 | cl::desc("The maximum search distance for instruction pairs")); |
70 | |
71 | static cl::opt<bool> |
72 | SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, |
73 | cl::desc("Replicating one element to a pair breaks the chain")); |
74 | |
75 | static cl::opt<unsigned> |
76 | VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, |
77 | cl::desc("The size of the native vector registers")); |
78 | |
79 | static cl::opt<unsigned> |
80 | MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, |
81 | cl::desc("The maximum number of pairing iterations")); |
82 | |
83 | static cl::opt<bool> |
84 | Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, |
85 | cl::desc("Don't try to form non-2^n-length vectors")); |
86 | |
87 | static cl::opt<unsigned> |
88 | MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, |
89 | cl::desc("The maximum number of pairable instructions per group")); |
90 | |
91 | static cl::opt<unsigned> |
92 | MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, |
93 | cl::desc("The maximum number of candidate instruction pairs per group")); |
94 | |
95 | static cl::opt<unsigned> |
96 | MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), |
97 | cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" |
98 | " a full cycle check")); |
99 | |
100 | static cl::opt<bool> |
101 | NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, |
102 | cl::desc("Don't try to vectorize boolean (i1) values")); |
103 | |
104 | static cl::opt<bool> |
105 | NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, |
106 | cl::desc("Don't try to vectorize integer values")); |
107 | |
108 | static cl::opt<bool> |
109 | NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, |
110 | cl::desc("Don't try to vectorize floating-point values")); |
111 | |
112 | // FIXME: This should default to false once pointer vector support works. |
113 | static cl::opt<bool> |
114 | NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, |
115 | cl::desc("Don't try to vectorize pointer values")); |
116 | |
117 | static cl::opt<bool> |
118 | NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, |
119 | cl::desc("Don't try to vectorize casting (conversion) operations")); |
120 | |
121 | static cl::opt<bool> |
122 | NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, |
123 | cl::desc("Don't try to vectorize floating-point math intrinsics")); |
124 | |
125 | static cl::opt<bool> |
126 | NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, |
127 | cl::desc("Don't try to vectorize BitManipulation intrinsics")); |
128 | |
129 | static cl::opt<bool> |
130 | NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, |
131 | cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); |
132 | |
133 | static cl::opt<bool> |
134 | NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, |
135 | cl::desc("Don't try to vectorize select instructions")); |
136 | |
137 | static cl::opt<bool> |
138 | NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, |
139 | cl::desc("Don't try to vectorize comparison instructions")); |
140 | |
141 | static cl::opt<bool> |
142 | NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, |
143 | cl::desc("Don't try to vectorize getelementptr instructions")); |
144 | |
145 | static cl::opt<bool> |
146 | NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, |
147 | cl::desc("Don't try to vectorize loads and stores")); |
148 | |
149 | static cl::opt<bool> |
150 | AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, |
151 | cl::desc("Only generate aligned loads and stores")); |
152 | |
153 | static cl::opt<bool> |
154 | NoMemOpBoost("bb-vectorize-no-mem-op-boost", |
155 | cl::init(false), cl::Hidden, |
156 | cl::desc("Don't boost the chain-depth contribution of loads and stores")); |
157 | |
158 | static cl::opt<bool> |
159 | FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, |
160 | cl::desc("Use a fast instruction dependency analysis")); |
161 | |
162 | #ifndef NDEBUG |
163 | static cl::opt<bool> |
164 | DebugInstructionExamination("bb-vectorize-debug-instruction-examination", |
165 | cl::init(false), cl::Hidden, |
166 | cl::desc("When debugging is enabled, output information on the" |
167 | " instruction-examination process")); |
168 | static cl::opt<bool> |
169 | DebugCandidateSelection("bb-vectorize-debug-candidate-selection", |
170 | cl::init(false), cl::Hidden, |
171 | cl::desc("When debugging is enabled, output information on the" |
172 | " candidate-selection process")); |
173 | static cl::opt<bool> |
174 | DebugPairSelection("bb-vectorize-debug-pair-selection", |
175 | cl::init(false), cl::Hidden, |
176 | cl::desc("When debugging is enabled, output information on the" |
177 | " pair-selection process")); |
178 | static cl::opt<bool> |
179 | DebugCycleCheck("bb-vectorize-debug-cycle-check", |
180 | cl::init(false), cl::Hidden, |
181 | cl::desc("When debugging is enabled, output information on the" |
182 | " cycle-checking process")); |
183 | |
184 | static cl::opt<bool> |
185 | PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", |
186 | cl::init(false), cl::Hidden, |
187 | cl::desc("When debugging is enabled, dump the basic block after" |
188 | " every pair is fused")); |
189 | #endif |
190 | |
191 | STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize")static llvm::Statistic NumFusedOps = { "bb-vectorize", "Number of operations fused by bb-vectorize" , 0, 0 }; |
192 | |
193 | namespace { |
194 | struct BBVectorize : public BasicBlockPass { |
195 | static char ID; // Pass identification, replacement for typeid |
196 | |
197 | const VectorizeConfig Config; |
198 | |
199 | BBVectorize(const VectorizeConfig &C = VectorizeConfig()) |
200 | : BasicBlockPass(ID), Config(C) { |
201 | initializeBBVectorizePass(*PassRegistry::getPassRegistry()); |
202 | } |
203 | |
204 | BBVectorize(Pass *P, const VectorizeConfig &C) |
205 | : BasicBlockPass(ID), Config(C) { |
206 | AA = &P->getAnalysis<AliasAnalysis>(); |
207 | DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
208 | SE = &P->getAnalysis<ScalarEvolution>(); |
209 | DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>(); |
210 | DL = DLP ? &DLP->getDataLayout() : nullptr; |
211 | TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>(); |
212 | } |
213 | |
214 | typedef std::pair<Value *, Value *> ValuePair; |
215 | typedef std::pair<ValuePair, int> ValuePairWithCost; |
216 | typedef std::pair<ValuePair, size_t> ValuePairWithDepth; |
217 | typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair |
218 | typedef std::pair<VPPair, unsigned> VPPairWithType; |
219 | |
220 | AliasAnalysis *AA; |
221 | DominatorTree *DT; |
222 | ScalarEvolution *SE; |
223 | const DataLayout *DL; |
224 | const TargetTransformInfo *TTI; |
225 | |
226 | // FIXME: const correct? |
227 | |
228 | bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); |
229 | |
230 | bool getCandidatePairs(BasicBlock &BB, |
231 | BasicBlock::iterator &Start, |
232 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
233 | DenseSet<ValuePair> &FixedOrderPairs, |
234 | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
235 | std::vector<Value *> &PairableInsts, bool NonPow2Len); |
236 | |
237 | // FIXME: The current implementation does not account for pairs that |
238 | // are connected in multiple ways. For example: |
239 | // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) |
240 | enum PairConnectionType { |
241 | PairConnectionDirect, |
242 | PairConnectionSwap, |
243 | PairConnectionSplat |
244 | }; |
245 | |
246 | void computeConnectedPairs( |
247 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
248 | DenseSet<ValuePair> &CandidatePairsSet, |
249 | std::vector<Value *> &PairableInsts, |
250 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
251 | DenseMap<VPPair, unsigned> &PairConnectionTypes); |
252 | |
253 | void buildDepMap(BasicBlock &BB, |
254 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
255 | std::vector<Value *> &PairableInsts, |
256 | DenseSet<ValuePair> &PairableInstUsers); |
257 | |
258 | void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
259 | DenseSet<ValuePair> &CandidatePairsSet, |
260 | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
261 | std::vector<Value *> &PairableInsts, |
262 | DenseSet<ValuePair> &FixedOrderPairs, |
263 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
264 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
265 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
266 | DenseSet<ValuePair> &PairableInstUsers, |
267 | DenseMap<Value *, Value *>& ChosenPairs); |
268 | |
269 | void fuseChosenPairs(BasicBlock &BB, |
270 | std::vector<Value *> &PairableInsts, |
271 | DenseMap<Value *, Value *>& ChosenPairs, |
272 | DenseSet<ValuePair> &FixedOrderPairs, |
273 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
274 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
275 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); |
276 | |
277 | |
278 | bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); |
279 | |
280 | bool areInstsCompatible(Instruction *I, Instruction *J, |
281 | bool IsSimpleLoadStore, bool NonPow2Len, |
282 | int &CostSavings, int &FixedOrder); |
283 | |
284 | bool trackUsesOfI(DenseSet<Value *> &Users, |
285 | AliasSetTracker &WriteSet, Instruction *I, |
286 | Instruction *J, bool UpdateUsers = true, |
287 | DenseSet<ValuePair> *LoadMoveSetPairs = nullptr); |
288 | |
289 | void computePairsConnectedTo( |
290 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
291 | DenseSet<ValuePair> &CandidatePairsSet, |
292 | std::vector<Value *> &PairableInsts, |
293 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
294 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
295 | ValuePair P); |
296 | |
297 | bool pairsConflict(ValuePair P, ValuePair Q, |
298 | DenseSet<ValuePair> &PairableInstUsers, |
299 | DenseMap<ValuePair, std::vector<ValuePair> > |
300 | *PairableInstUserMap = nullptr, |
301 | DenseSet<VPPair> *PairableInstUserPairSet = nullptr); |
302 | |
303 | bool pairWillFormCycle(ValuePair P, |
304 | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, |
305 | DenseSet<ValuePair> &CurrentPairs); |
306 | |
307 | void pruneDAGFor( |
308 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
309 | std::vector<Value *> &PairableInsts, |
310 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
311 | DenseSet<ValuePair> &PairableInstUsers, |
312 | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
313 | DenseSet<VPPair> &PairableInstUserPairSet, |
314 | DenseMap<Value *, Value *> &ChosenPairs, |
315 | DenseMap<ValuePair, size_t> &DAG, |
316 | DenseSet<ValuePair> &PrunedDAG, ValuePair J, |
317 | bool UseCycleCheck); |
318 | |
319 | void buildInitialDAGFor( |
320 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
321 | DenseSet<ValuePair> &CandidatePairsSet, |
322 | std::vector<Value *> &PairableInsts, |
323 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
324 | DenseSet<ValuePair> &PairableInstUsers, |
325 | DenseMap<Value *, Value *> &ChosenPairs, |
326 | DenseMap<ValuePair, size_t> &DAG, ValuePair J); |
327 | |
328 | void findBestDAGFor( |
329 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
330 | DenseSet<ValuePair> &CandidatePairsSet, |
331 | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
332 | std::vector<Value *> &PairableInsts, |
333 | DenseSet<ValuePair> &FixedOrderPairs, |
334 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
335 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
336 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
337 | DenseSet<ValuePair> &PairableInstUsers, |
338 | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
339 | DenseSet<VPPair> &PairableInstUserPairSet, |
340 | DenseMap<Value *, Value *> &ChosenPairs, |
341 | DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, |
342 | int &BestEffSize, Value *II, std::vector<Value *>&JJ, |
343 | bool UseCycleCheck); |
344 | |
345 | Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, |
346 | Instruction *J, unsigned o); |
347 | |
348 | void fillNewShuffleMask(LLVMContext& Context, Instruction *J, |
349 | unsigned MaskOffset, unsigned NumInElem, |
350 | unsigned NumInElem1, unsigned IdxOffset, |
351 | std::vector<Constant*> &Mask); |
352 | |
353 | Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, |
354 | Instruction *J); |
355 | |
356 | bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, |
357 | unsigned o, Value *&LOp, unsigned numElemL, |
358 | Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, |
359 | unsigned IdxOff = 0); |
360 | |
361 | Value *getReplacementInput(LLVMContext& Context, Instruction *I, |
362 | Instruction *J, unsigned o, bool IBeforeJ); |
363 | |
364 | void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, |
365 | Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, |
366 | bool IBeforeJ); |
367 | |
368 | void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, |
369 | Instruction *J, Instruction *K, |
370 | Instruction *&InsertionPt, Instruction *&K1, |
371 | Instruction *&K2); |
372 | |
373 | void collectPairLoadMoveSet(BasicBlock &BB, |
374 | DenseMap<Value *, Value *> &ChosenPairs, |
375 | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
376 | DenseSet<ValuePair> &LoadMoveSetPairs, |
377 | Instruction *I); |
378 | |
379 | void collectLoadMoveSet(BasicBlock &BB, |
380 | std::vector<Value *> &PairableInsts, |
381 | DenseMap<Value *, Value *> &ChosenPairs, |
382 | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
383 | DenseSet<ValuePair> &LoadMoveSetPairs); |
384 | |
385 | bool canMoveUsesOfIAfterJ(BasicBlock &BB, |
386 | DenseSet<ValuePair> &LoadMoveSetPairs, |
387 | Instruction *I, Instruction *J); |
388 | |
389 | void moveUsesOfIAfterJ(BasicBlock &BB, |
390 | DenseSet<ValuePair> &LoadMoveSetPairs, |
391 | Instruction *&InsertionPt, |
392 | Instruction *I, Instruction *J); |
393 | |
394 | bool vectorizeBB(BasicBlock &BB) { |
395 | if (skipOptnoneFunction(BB)) |
396 | return false; |
397 | if (!DT->isReachableFromEntry(&BB)) { |
398 | DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: skipping unreachable " << BB.getName() << " in " << BB.getParent( )->getName() << "\n"; } } while (0) |
399 | " in " << BB.getParent()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: skipping unreachable " << BB.getName() << " in " << BB.getParent( )->getName() << "\n"; } } while (0); |
400 | return false; |
401 | } |
402 | |
403 | DEBUG(if (TTI) dbgs() << "BBV: using target information\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (TTI) dbgs() << "BBV: using target information\n" ; } } while (0); |
404 | |
405 | bool changed = false; |
406 | // Iterate a sufficient number of times to merge types of size 1 bit, |
407 | // then 2 bits, then 4, etc. up to half of the target vector width of the |
408 | // target vector register. |
409 | unsigned n = 1; |
410 | for (unsigned v = 2; |
411 | (TTI || v <= Config.VectorBits) && |
412 | (!Config.MaxIter || n <= Config.MaxIter); |
413 | v *= 2, ++n) { |
414 | DEBUG(dbgs() << "BBV: fusing loop #" << n <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"; } } while (0) |
415 | " for " << BB.getName() << " in " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"; } } while (0) |
416 | BB.getParent()->getName() << "...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"; } } while (0); |
417 | if (vectorizePairs(BB)) |
418 | changed = true; |
419 | else |
420 | break; |
421 | } |
422 | |
423 | if (changed && !Pow2LenOnly) { |
424 | ++n; |
425 | for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { |
426 | DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"; } } while (0) |
427 | n << " for " << BB.getName() << " in " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"; } } while (0) |
428 | BB.getParent()->getName() << "...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"; } } while (0); |
429 | if (!vectorizePairs(BB, true)) break; |
430 | } |
431 | } |
432 | |
433 | DEBUG(dbgs() << "BBV: done!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: done!\n"; } } while (0); |
434 | return changed; |
435 | } |
436 | |
437 | bool runOnBasicBlock(BasicBlock &BB) override { |
438 | // OptimizeNone check deferred to vectorizeBB(). |
439 | |
440 | AA = &getAnalysis<AliasAnalysis>(); |
441 | DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
442 | SE = &getAnalysis<ScalarEvolution>(); |
443 | DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); |
444 | DL = DLP ? &DLP->getDataLayout() : nullptr; |
445 | TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfo>(); |
446 | |
447 | return vectorizeBB(BB); |
448 | } |
449 | |
450 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
451 | BasicBlockPass::getAnalysisUsage(AU); |
452 | AU.addRequired<AliasAnalysis>(); |
453 | AU.addRequired<DominatorTreeWrapperPass>(); |
454 | AU.addRequired<ScalarEvolution>(); |
455 | AU.addRequired<TargetTransformInfo>(); |
456 | AU.addPreserved<AliasAnalysis>(); |
457 | AU.addPreserved<DominatorTreeWrapperPass>(); |
458 | AU.addPreserved<ScalarEvolution>(); |
459 | AU.setPreservesCFG(); |
460 | } |
461 | |
462 | static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { |
463 | assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&((ElemTy->getScalarType() == Elem2Ty->getScalarType() && "Cannot form vector from incompatible scalar types") ? static_cast <void> (0) : __assert_fail ("ElemTy->getScalarType() == Elem2Ty->getScalarType() && \"Cannot form vector from incompatible scalar types\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 464, __PRETTY_FUNCTION__)) |
464 | "Cannot form vector from incompatible scalar types")((ElemTy->getScalarType() == Elem2Ty->getScalarType() && "Cannot form vector from incompatible scalar types") ? static_cast <void> (0) : __assert_fail ("ElemTy->getScalarType() == Elem2Ty->getScalarType() && \"Cannot form vector from incompatible scalar types\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 464, __PRETTY_FUNCTION__)); |
465 | Type *STy = ElemTy->getScalarType(); |
466 | |
467 | unsigned numElem; |
468 | if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { |
469 | numElem = VTy->getNumElements(); |
470 | } else { |
471 | numElem = 1; |
472 | } |
473 | |
474 | if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { |
475 | numElem += VTy->getNumElements(); |
476 | } else { |
477 | numElem += 1; |
478 | } |
479 | |
480 | return VectorType::get(STy, numElem); |
481 | } |
482 | |
483 | static inline void getInstructionTypes(Instruction *I, |
484 | Type *&T1, Type *&T2) { |
485 | if (StoreInst *SI = dyn_cast<StoreInst>(I)) { |
486 | // For stores, it is the value type, not the pointer type that matters |
487 | // because the value is what will come from a vector register. |
488 | |
489 | Value *IVal = SI->getValueOperand(); |
490 | T1 = IVal->getType(); |
491 | } else { |
492 | T1 = I->getType(); |
493 | } |
494 | |
495 | if (CastInst *CI = dyn_cast<CastInst>(I)) |
496 | T2 = CI->getSrcTy(); |
497 | else |
498 | T2 = T1; |
499 | |
500 | if (SelectInst *SI = dyn_cast<SelectInst>(I)) { |
501 | T2 = SI->getCondition()->getType(); |
502 | } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { |
503 | T2 = SI->getOperand(0)->getType(); |
504 | } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { |
505 | T2 = CI->getOperand(0)->getType(); |
506 | } |
507 | } |
508 | |
509 | // Returns the weight associated with the provided value. A chain of |
510 | // candidate pairs has a length given by the sum of the weights of its |
511 | // members (one weight per pair; the weight of each member of the pair |
512 | // is assumed to be the same). This length is then compared to the |
513 | // chain-length threshold to determine if a given chain is significant |
514 | // enough to be vectorized. The length is also used in comparing |
515 | // candidate chains where longer chains are considered to be better. |
516 | // Note: when this function returns 0, the resulting instructions are |
517 | // not actually fused. |
518 | inline size_t getDepthFactor(Value *V) { |
519 | // InsertElement and ExtractElement have a depth factor of zero. This is |
520 | // for two reasons: First, they cannot be usefully fused. Second, because |
521 | // the pass generates a lot of these, they can confuse the simple metric |
522 | // used to compare the dags in the next iteration. Thus, giving them a |
523 | // weight of zero allows the pass to essentially ignore them in |
524 | // subsequent iterations when looking for vectorization opportunities |
525 | // while still tracking dependency chains that flow through those |
526 | // instructions. |
527 | if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) |
528 | return 0; |
529 | |
530 | // Give a load or store half of the required depth so that load/store |
531 | // pairs will vectorize. |
532 | if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) |
533 | return Config.ReqChainDepth/2; |
534 | |
535 | return 1; |
536 | } |
537 | |
538 | // Returns the cost of the provided instruction using TTI. |
539 | // This does not handle loads and stores. |
540 | unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2, |
541 | TargetTransformInfo::OperandValueKind Op1VK = |
542 | TargetTransformInfo::OK_AnyValue, |
543 | TargetTransformInfo::OperandValueKind Op2VK = |
544 | TargetTransformInfo::OK_AnyValue) { |
545 | switch (Opcode) { |
546 | default: break; |
547 | case Instruction::GetElementPtr: |
548 | // We mark this instruction as zero-cost because scalar GEPs are usually |
549 | // lowered to the instruction addressing mode. At the moment we don't |
550 | // generate vector GEPs. |
551 | return 0; |
552 | case Instruction::Br: |
553 | return TTI->getCFInstrCost(Opcode); |
554 | case Instruction::PHI: |
555 | return 0; |
556 | case Instruction::Add: |
557 | case Instruction::FAdd: |
558 | case Instruction::Sub: |
559 | case Instruction::FSub: |
560 | case Instruction::Mul: |
561 | case Instruction::FMul: |
562 | case Instruction::UDiv: |
563 | case Instruction::SDiv: |
564 | case Instruction::FDiv: |
565 | case Instruction::URem: |
566 | case Instruction::SRem: |
567 | case Instruction::FRem: |
568 | case Instruction::Shl: |
569 | case Instruction::LShr: |
570 | case Instruction::AShr: |
571 | case Instruction::And: |
572 | case Instruction::Or: |
573 | case Instruction::Xor: |
574 | return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); |
575 | case Instruction::Select: |
576 | case Instruction::ICmp: |
577 | case Instruction::FCmp: |
578 | return TTI->getCmpSelInstrCost(Opcode, T1, T2); |
579 | case Instruction::ZExt: |
580 | case Instruction::SExt: |
581 | case Instruction::FPToUI: |
582 | case Instruction::FPToSI: |
583 | case Instruction::FPExt: |
584 | case Instruction::PtrToInt: |
585 | case Instruction::IntToPtr: |
586 | case Instruction::SIToFP: |
587 | case Instruction::UIToFP: |
588 | case Instruction::Trunc: |
589 | case Instruction::FPTrunc: |
590 | case Instruction::BitCast: |
591 | case Instruction::ShuffleVector: |
592 | return TTI->getCastInstrCost(Opcode, T1, T2); |
593 | } |
594 | |
595 | return 1; |
596 | } |
597 | |
598 | // This determines the relative offset of two loads or stores, returning |
599 | // true if the offset could be determined to be some constant value. |
600 | // For example, if OffsetInElmts == 1, then J accesses the memory directly |
601 | // after I; if OffsetInElmts == -1 then I accesses the memory |
602 | // directly after J. |
603 | bool getPairPtrInfo(Instruction *I, Instruction *J, |
604 | Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, |
605 | unsigned &IAddressSpace, unsigned &JAddressSpace, |
606 | int64_t &OffsetInElmts, bool ComputeOffset = true) { |
607 | OffsetInElmts = 0; |
608 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) { |
609 | LoadInst *LJ = cast<LoadInst>(J); |
610 | IPtr = LI->getPointerOperand(); |
611 | JPtr = LJ->getPointerOperand(); |
612 | IAlignment = LI->getAlignment(); |
613 | JAlignment = LJ->getAlignment(); |
614 | IAddressSpace = LI->getPointerAddressSpace(); |
615 | JAddressSpace = LJ->getPointerAddressSpace(); |
616 | } else { |
617 | StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); |
618 | IPtr = SI->getPointerOperand(); |
619 | JPtr = SJ->getPointerOperand(); |
620 | IAlignment = SI->getAlignment(); |
621 | JAlignment = SJ->getAlignment(); |
622 | IAddressSpace = SI->getPointerAddressSpace(); |
623 | JAddressSpace = SJ->getPointerAddressSpace(); |
624 | } |
625 | |
626 | if (!ComputeOffset) |
627 | return true; |
628 | |
629 | const SCEV *IPtrSCEV = SE->getSCEV(IPtr); |
630 | const SCEV *JPtrSCEV = SE->getSCEV(JPtr); |
631 | |
632 | // If this is a trivial offset, then we'll get something like |
633 | // 1*sizeof(type). With target data, which we need anyway, this will get |
634 | // constant folded into a number. |
635 | const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); |
636 | if (const SCEVConstant *ConstOffSCEV = |
637 | dyn_cast<SCEVConstant>(OffsetSCEV)) { |
638 | ConstantInt *IntOff = ConstOffSCEV->getValue(); |
639 | int64_t Offset = IntOff->getSExtValue(); |
640 | |
641 | Type *VTy = IPtr->getType()->getPointerElementType(); |
642 | int64_t VTyTSS = (int64_t) DL->getTypeStoreSize(VTy); |
643 | |
644 | Type *VTy2 = JPtr->getType()->getPointerElementType(); |
645 | if (VTy != VTy2 && Offset < 0) { |
646 | int64_t VTy2TSS = (int64_t) DL->getTypeStoreSize(VTy2); |
647 | OffsetInElmts = Offset/VTy2TSS; |
648 | return (abs64(Offset) % VTy2TSS) == 0; |
649 | } |
650 | |
651 | OffsetInElmts = Offset/VTyTSS; |
652 | return (abs64(Offset) % VTyTSS) == 0; |
653 | } |
654 | |
655 | return false; |
656 | } |
657 | |
658 | // Returns true if the provided CallInst represents an intrinsic that can |
659 | // be vectorized. |
660 | bool isVectorizableIntrinsic(CallInst* I) { |
661 | Function *F = I->getCalledFunction(); |
662 | if (!F) return false; |
663 | |
664 | Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); |
665 | if (!IID) return false; |
666 | |
667 | switch(IID) { |
668 | default: |
669 | return false; |
670 | case Intrinsic::sqrt: |
671 | case Intrinsic::powi: |
672 | case Intrinsic::sin: |
673 | case Intrinsic::cos: |
674 | case Intrinsic::log: |
675 | case Intrinsic::log2: |
676 | case Intrinsic::log10: |
677 | case Intrinsic::exp: |
678 | case Intrinsic::exp2: |
679 | case Intrinsic::pow: |
680 | case Intrinsic::round: |
681 | case Intrinsic::copysign: |
682 | case Intrinsic::ceil: |
683 | case Intrinsic::nearbyint: |
684 | case Intrinsic::rint: |
685 | case Intrinsic::trunc: |
686 | case Intrinsic::floor: |
687 | case Intrinsic::fabs: |
688 | case Intrinsic::minnum: |
689 | case Intrinsic::maxnum: |
690 | return Config.VectorizeMath; |
691 | case Intrinsic::bswap: |
692 | case Intrinsic::ctpop: |
693 | case Intrinsic::ctlz: |
694 | case Intrinsic::cttz: |
695 | return Config.VectorizeBitManipulations; |
696 | case Intrinsic::fma: |
697 | case Intrinsic::fmuladd: |
698 | return Config.VectorizeFMA; |
699 | } |
700 | } |
701 | |
702 | bool isPureIEChain(InsertElementInst *IE) { |
703 | InsertElementInst *IENext = IE; |
704 | do { |
705 | if (!isa<UndefValue>(IENext->getOperand(0)) && |
706 | !isa<InsertElementInst>(IENext->getOperand(0))) { |
707 | return false; |
708 | } |
709 | } while ((IENext = |
710 | dyn_cast<InsertElementInst>(IENext->getOperand(0)))); |
711 | |
712 | return true; |
713 | } |
714 | }; |
715 | |
716 | // This function implements one vectorization iteration on the provided |
717 | // basic block. It returns true if the block is changed. |
718 | bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { |
719 | bool ShouldContinue; |
720 | BasicBlock::iterator Start = BB.getFirstInsertionPt(); |
721 | |
722 | std::vector<Value *> AllPairableInsts; |
723 | DenseMap<Value *, Value *> AllChosenPairs; |
724 | DenseSet<ValuePair> AllFixedOrderPairs; |
725 | DenseMap<VPPair, unsigned> AllPairConnectionTypes; |
726 | DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, |
727 | AllConnectedPairDeps; |
728 | |
729 | do { |
730 | std::vector<Value *> PairableInsts; |
731 | DenseMap<Value *, std::vector<Value *> > CandidatePairs; |
732 | DenseSet<ValuePair> FixedOrderPairs; |
733 | DenseMap<ValuePair, int> CandidatePairCostSavings; |
734 | ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, |
735 | FixedOrderPairs, |
736 | CandidatePairCostSavings, |
737 | PairableInsts, NonPow2Len); |
738 | if (PairableInsts.empty()) continue; |
739 | |
740 | // Build the candidate pair set for faster lookups. |
741 | DenseSet<ValuePair> CandidatePairsSet; |
742 | for (DenseMap<Value *, std::vector<Value *> >::iterator I = |
743 | CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) |
744 | for (std::vector<Value *>::iterator J = I->second.begin(), |
745 | JE = I->second.end(); J != JE; ++J) |
746 | CandidatePairsSet.insert(ValuePair(I->first, *J)); |
747 | |
748 | // Now we have a map of all of the pairable instructions and we need to |
749 | // select the best possible pairing. A good pairing is one such that the |
750 | // users of the pair are also paired. This defines a (directed) forest |
751 | // over the pairs such that two pairs are connected iff the second pair |
752 | // uses the first. |
753 | |
754 | // Note that it only matters that both members of the second pair use some |
755 | // element of the first pair (to allow for splatting). |
756 | |
757 | DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, |
758 | ConnectedPairDeps; |
759 | DenseMap<VPPair, unsigned> PairConnectionTypes; |
760 | computeConnectedPairs(CandidatePairs, CandidatePairsSet, |
761 | PairableInsts, ConnectedPairs, PairConnectionTypes); |
762 | if (ConnectedPairs.empty()) continue; |
763 | |
764 | for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator |
765 | I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); |
766 | I != IE; ++I) |
767 | for (std::vector<ValuePair>::iterator J = I->second.begin(), |
768 | JE = I->second.end(); J != JE; ++J) |
769 | ConnectedPairDeps[*J].push_back(I->first); |
770 | |
771 | // Build the pairable-instruction dependency map |
772 | DenseSet<ValuePair> PairableInstUsers; |
773 | buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); |
774 | |
775 | // There is now a graph of the connected pairs. For each variable, pick |
776 | // the pairing with the largest dag meeting the depth requirement on at |
777 | // least one branch. Then select all pairings that are part of that dag |
778 | // and remove them from the list of available pairings and pairable |
779 | // variables. |
780 | |
781 | DenseMap<Value *, Value *> ChosenPairs; |
782 | choosePairs(CandidatePairs, CandidatePairsSet, |
783 | CandidatePairCostSavings, |
784 | PairableInsts, FixedOrderPairs, PairConnectionTypes, |
785 | ConnectedPairs, ConnectedPairDeps, |
786 | PairableInstUsers, ChosenPairs); |
787 | |
788 | if (ChosenPairs.empty()) continue; |
789 | AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), |
790 | PairableInsts.end()); |
791 | AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); |
792 | |
793 | // Only for the chosen pairs, propagate information on fixed-order pairs, |
794 | // pair connections, and their types to the data structures used by the |
795 | // pair fusion procedures. |
796 | for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), |
797 | IE = ChosenPairs.end(); I != IE; ++I) { |
798 | if (FixedOrderPairs.count(*I)) |
799 | AllFixedOrderPairs.insert(*I); |
800 | else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) |
801 | AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); |
802 | |
803 | for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); |
804 | J != IE; ++J) { |
805 | DenseMap<VPPair, unsigned>::iterator K = |
806 | PairConnectionTypes.find(VPPair(*I, *J)); |
807 | if (K != PairConnectionTypes.end()) { |
808 | AllPairConnectionTypes.insert(*K); |
809 | } else { |
810 | K = PairConnectionTypes.find(VPPair(*J, *I)); |
811 | if (K != PairConnectionTypes.end()) |
812 | AllPairConnectionTypes.insert(*K); |
813 | } |
814 | } |
815 | } |
816 | |
817 | for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator |
818 | I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); |
819 | I != IE; ++I) |
820 | for (std::vector<ValuePair>::iterator J = I->second.begin(), |
821 | JE = I->second.end(); J != JE; ++J) |
822 | if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { |
823 | AllConnectedPairs[I->first].push_back(*J); |
824 | AllConnectedPairDeps[*J].push_back(I->first); |
825 | } |
826 | } while (ShouldContinue); |
827 | |
828 | if (AllChosenPairs.empty()) return false; |
829 | NumFusedOps += AllChosenPairs.size(); |
830 | |
831 | // A set of pairs has now been selected. It is now necessary to replace the |
832 | // paired instructions with vector instructions. For this procedure each |
833 | // operand must be replaced with a vector operand. This vector is formed |
834 | // by using build_vector on the old operands. The replaced values are then |
835 | // replaced with a vector_extract on the result. Subsequent optimization |
836 | // passes should coalesce the build/extract combinations. |
837 | |
838 | fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, |
839 | AllPairConnectionTypes, |
840 | AllConnectedPairs, AllConnectedPairDeps); |
841 | |
842 | // It is important to cleanup here so that future iterations of this |
843 | // function have less work to do. |
844 | (void) SimplifyInstructionsInBlock(&BB, DL, AA->getTargetLibraryInfo()); |
845 | return true; |
846 | } |
847 | |
848 | // This function returns true if the provided instruction is capable of being |
849 | // fused into a vector instruction. This determination is based only on the |
850 | // type and other attributes of the instruction. |
851 | bool BBVectorize::isInstVectorizable(Instruction *I, |
852 | bool &IsSimpleLoadStore) { |
853 | IsSimpleLoadStore = false; |
854 | |
855 | if (CallInst *C = dyn_cast<CallInst>(I)) { |
856 | if (!isVectorizableIntrinsic(C)) |
857 | return false; |
858 | } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { |
859 | // Vectorize simple loads if possbile: |
860 | IsSimpleLoadStore = L->isSimple(); |
861 | if (!IsSimpleLoadStore || !Config.VectorizeMemOps) |
862 | return false; |
863 | } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { |
864 | // Vectorize simple stores if possbile: |
865 | IsSimpleLoadStore = S->isSimple(); |
866 | if (!IsSimpleLoadStore || !Config.VectorizeMemOps) |
867 | return false; |
868 | } else if (CastInst *C = dyn_cast<CastInst>(I)) { |
869 | // We can vectorize casts, but not casts of pointer types, etc. |
870 | if (!Config.VectorizeCasts) |
871 | return false; |
872 | |
873 | Type *SrcTy = C->getSrcTy(); |
874 | if (!SrcTy->isSingleValueType()) |
875 | return false; |
876 | |
877 | Type *DestTy = C->getDestTy(); |
878 | if (!DestTy->isSingleValueType()) |
879 | return false; |
880 | } else if (isa<SelectInst>(I)) { |
881 | if (!Config.VectorizeSelect) |
882 | return false; |
883 | } else if (isa<CmpInst>(I)) { |
884 | if (!Config.VectorizeCmp) |
885 | return false; |
886 | } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { |
887 | if (!Config.VectorizeGEP) |
888 | return false; |
889 | |
890 | // Currently, vector GEPs exist only with one index. |
891 | if (G->getNumIndices() != 1) |
892 | return false; |
893 | } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || |
894 | isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { |
895 | return false; |
896 | } |
897 | |
898 | // We can't vectorize memory operations without target data |
899 | if (!DL && IsSimpleLoadStore) |
900 | return false; |
901 | |
902 | Type *T1, *T2; |
903 | getInstructionTypes(I, T1, T2); |
904 | |
905 | // Not every type can be vectorized... |
906 | if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || |
907 | !(VectorType::isValidElementType(T2) || T2->isVectorTy())) |
908 | return false; |
909 | |
910 | if (T1->getScalarSizeInBits() == 1) { |
911 | if (!Config.VectorizeBools) |
912 | return false; |
913 | } else { |
914 | if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) |
915 | return false; |
916 | } |
917 | |
918 | if (T2->getScalarSizeInBits() == 1) { |
919 | if (!Config.VectorizeBools) |
920 | return false; |
921 | } else { |
922 | if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) |
923 | return false; |
924 | } |
925 | |
926 | if (!Config.VectorizeFloats |
927 | && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) |
928 | return false; |
929 | |
930 | // Don't vectorize target-specific types. |
931 | if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) |
932 | return false; |
933 | if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) |
934 | return false; |
935 | |
936 | if ((!Config.VectorizePointers || !DL) && |
937 | (T1->getScalarType()->isPointerTy() || |
938 | T2->getScalarType()->isPointerTy())) |
939 | return false; |
940 | |
941 | if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || |
942 | T2->getPrimitiveSizeInBits() >= Config.VectorBits)) |
943 | return false; |
944 | |
945 | return true; |
946 | } |
947 | |
948 | // This function returns true if the two provided instructions are compatible |
949 | // (meaning that they can be fused into a vector instruction). This assumes |
950 | // that I has already been determined to be vectorizable and that J is not |
951 | // in the use dag of I. |
952 | bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, |
953 | bool IsSimpleLoadStore, bool NonPow2Len, |
954 | int &CostSavings, int &FixedOrder) { |
955 | DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"; } } while (0) |
956 | " <-> " << *J << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"; } } while (0); |
957 | |
958 | CostSavings = 0; |
959 | FixedOrder = 0; |
960 | |
961 | // Loads and stores can be merged if they have different alignments, |
962 | // but are otherwise the same. |
963 | if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | |
964 | (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) |
965 | return false; |
966 | |
967 | Type *IT1, *IT2, *JT1, *JT2; |
968 | getInstructionTypes(I, IT1, IT2); |
969 | getInstructionTypes(J, JT1, JT2); |
970 | unsigned MaxTypeBits = std::max( |
971 | IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), |
972 | IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); |
973 | if (!TTI && MaxTypeBits > Config.VectorBits) |
974 | return false; |
975 | |
976 | // FIXME: handle addsub-type operations! |
977 | |
978 | if (IsSimpleLoadStore) { |
979 | Value *IPtr, *JPtr; |
980 | unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; |
981 | int64_t OffsetInElmts = 0; |
982 | if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, |
983 | IAddressSpace, JAddressSpace, |
984 | OffsetInElmts) && abs64(OffsetInElmts) == 1) { |
985 | FixedOrder = (int) OffsetInElmts; |
986 | unsigned BottomAlignment = IAlignment; |
987 | if (OffsetInElmts < 0) BottomAlignment = JAlignment; |
988 | |
989 | Type *aTypeI = isa<StoreInst>(I) ? |
990 | cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); |
991 | Type *aTypeJ = isa<StoreInst>(J) ? |
992 | cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); |
993 | Type *VType = getVecTypeForPair(aTypeI, aTypeJ); |
994 | |
995 | if (Config.AlignedOnly) { |
996 | // An aligned load or store is possible only if the instruction |
997 | // with the lower offset has an alignment suitable for the |
998 | // vector type. |
999 | |
1000 | unsigned VecAlignment = DL->getPrefTypeAlignment(VType); |
1001 | if (BottomAlignment < VecAlignment) |
1002 | return false; |
1003 | } |
1004 | |
1005 | if (TTI) { |
1006 | unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, |
1007 | IAlignment, IAddressSpace); |
1008 | unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, |
1009 | JAlignment, JAddressSpace); |
1010 | unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, |
1011 | BottomAlignment, |
1012 | IAddressSpace); |
1013 | |
1014 | ICost += TTI->getAddressComputationCost(aTypeI); |
1015 | JCost += TTI->getAddressComputationCost(aTypeJ); |
1016 | VCost += TTI->getAddressComputationCost(VType); |
1017 | |
1018 | if (VCost > ICost + JCost) |
1019 | return false; |
1020 | |
1021 | // We don't want to fuse to a type that will be split, even |
1022 | // if the two input types will also be split and there is no other |
1023 | // associated cost. |
1024 | unsigned VParts = TTI->getNumberOfParts(VType); |
1025 | if (VParts > 1) |
1026 | return false; |
1027 | else if (!VParts && VCost == ICost + JCost) |
1028 | return false; |
1029 | |
1030 | CostSavings = ICost + JCost - VCost; |
1031 | } |
1032 | } else { |
1033 | return false; |
1034 | } |
1035 | } else if (TTI) { |
1036 | unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); |
1037 | unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); |
1038 | Type *VT1 = getVecTypeForPair(IT1, JT1), |
1039 | *VT2 = getVecTypeForPair(IT2, JT2); |
1040 | TargetTransformInfo::OperandValueKind Op1VK = |
1041 | TargetTransformInfo::OK_AnyValue; |
1042 | TargetTransformInfo::OperandValueKind Op2VK = |
1043 | TargetTransformInfo::OK_AnyValue; |
1044 | |
1045 | // On some targets (example X86) the cost of a vector shift may vary |
1046 | // depending on whether the second operand is a Uniform or |
1047 | // NonUniform Constant. |
1048 | switch (I->getOpcode()) { |
1049 | default : break; |
1050 | case Instruction::Shl: |
1051 | case Instruction::LShr: |
1052 | case Instruction::AShr: |
1053 | |
1054 | // If both I and J are scalar shifts by constant, then the |
1055 | // merged vector shift count would be either a constant splat value |
1056 | // or a non-uniform vector of constants. |
1057 | if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) { |
1058 | if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1))) |
1059 | Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue : |
1060 | TargetTransformInfo::OK_NonUniformConstantValue; |
1061 | } else { |
1062 | // Check for a splat of a constant or for a non uniform vector |
1063 | // of constants. |
1064 | Value *IOp = I->getOperand(1); |
1065 | Value *JOp = J->getOperand(1); |
1066 | if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) && |
1067 | (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) { |
1068 | Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; |
1069 | Constant *SplatValue = cast<Constant>(IOp)->getSplatValue(); |
1070 | if (SplatValue != nullptr && |
1071 | SplatValue == cast<Constant>(JOp)->getSplatValue()) |
1072 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
1073 | } |
1074 | } |
1075 | } |
1076 | |
1077 | // Note that this procedure is incorrect for insert and extract element |
1078 | // instructions (because combining these often results in a shuffle), |
1079 | // but this cost is ignored (because insert and extract element |
1080 | // instructions are assigned a zero depth factor and are not really |
1081 | // fused in general). |
1082 | unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK); |
1083 | |
1084 | if (VCost > ICost + JCost) |
1085 | return false; |
1086 | |
1087 | // We don't want to fuse to a type that will be split, even |
1088 | // if the two input types will also be split and there is no other |
1089 | // associated cost. |
1090 | unsigned VParts1 = TTI->getNumberOfParts(VT1), |
1091 | VParts2 = TTI->getNumberOfParts(VT2); |
1092 | if (VParts1 > 1 || VParts2 > 1) |
1093 | return false; |
1094 | else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) |
1095 | return false; |
1096 | |
1097 | CostSavings = ICost + JCost - VCost; |
1098 | } |
1099 | |
1100 | // The powi,ctlz,cttz intrinsics are special because only the first |
1101 | // argument is vectorized, the second arguments must be equal. |
1102 | CallInst *CI = dyn_cast<CallInst>(I); |
1103 | Function *FI; |
1104 | if (CI && (FI = CI->getCalledFunction())) { |
1105 | Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); |
1106 | if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || |
1107 | IID == Intrinsic::cttz) { |
1108 | Value *A1I = CI->getArgOperand(1), |
1109 | *A1J = cast<CallInst>(J)->getArgOperand(1); |
1110 | const SCEV *A1ISCEV = SE->getSCEV(A1I), |
1111 | *A1JSCEV = SE->getSCEV(A1J); |
1112 | return (A1ISCEV == A1JSCEV); |
1113 | } |
1114 | |
1115 | if (IID && TTI) { |
1116 | SmallVector<Type*, 4> Tys; |
1117 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) |
1118 | Tys.push_back(CI->getArgOperand(i)->getType()); |
1119 | unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); |
1120 | |
1121 | Tys.clear(); |
1122 | CallInst *CJ = cast<CallInst>(J); |
1123 | for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) |
1124 | Tys.push_back(CJ->getArgOperand(i)->getType()); |
1125 | unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); |
1126 | |
1127 | Tys.clear(); |
1128 | assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&((CI->getNumArgOperands() == CJ->getNumArgOperands() && "Intrinsic argument counts differ") ? static_cast<void> (0) : __assert_fail ("CI->getNumArgOperands() == CJ->getNumArgOperands() && \"Intrinsic argument counts differ\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 1129, __PRETTY_FUNCTION__)) |
1129 | "Intrinsic argument counts differ")((CI->getNumArgOperands() == CJ->getNumArgOperands() && "Intrinsic argument counts differ") ? static_cast<void> (0) : __assert_fail ("CI->getNumArgOperands() == CJ->getNumArgOperands() && \"Intrinsic argument counts differ\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 1129, __PRETTY_FUNCTION__)); |
1130 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { |
1131 | if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || |
1132 | IID == Intrinsic::cttz) && i == 1) |
1133 | Tys.push_back(CI->getArgOperand(i)->getType()); |
1134 | else |
1135 | Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), |
1136 | CJ->getArgOperand(i)->getType())); |
1137 | } |
1138 | |
1139 | Type *RetTy = getVecTypeForPair(IT1, JT1); |
1140 | unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); |
1141 | |
1142 | if (VCost > ICost + JCost) |
1143 | return false; |
1144 | |
1145 | // We don't want to fuse to a type that will be split, even |
1146 | // if the two input types will also be split and there is no other |
1147 | // associated cost. |
1148 | unsigned RetParts = TTI->getNumberOfParts(RetTy); |
1149 | if (RetParts > 1) |
1150 | return false; |
1151 | else if (!RetParts && VCost == ICost + JCost) |
1152 | return false; |
1153 | |
1154 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { |
1155 | if (!Tys[i]->isVectorTy()) |
1156 | continue; |
1157 | |
1158 | unsigned NumParts = TTI->getNumberOfParts(Tys[i]); |
1159 | if (NumParts > 1) |
1160 | return false; |
1161 | else if (!NumParts && VCost == ICost + JCost) |
1162 | return false; |
1163 | } |
1164 | |
1165 | CostSavings = ICost + JCost - VCost; |
1166 | } |
1167 | } |
1168 | |
1169 | return true; |
1170 | } |
1171 | |
1172 | // Figure out whether or not J uses I and update the users and write-set |
1173 | // structures associated with I. Specifically, Users represents the set of |
1174 | // instructions that depend on I. WriteSet represents the set |
1175 | // of memory locations that are dependent on I. If UpdateUsers is true, |
1176 | // and J uses I, then Users is updated to contain J and WriteSet is updated |
1177 | // to contain any memory locations to which J writes. The function returns |
1178 | // true if J uses I. By default, alias analysis is used to determine |
1179 | // whether J reads from memory that overlaps with a location in WriteSet. |
1180 | // If LoadMoveSet is not null, then it is a previously-computed map |
1181 | // where the key is the memory-based user instruction and the value is |
1182 | // the instruction to be compared with I. So, if LoadMoveSet is provided, |
1183 | // then the alias analysis is not used. This is necessary because this |
1184 | // function is called during the process of moving instructions during |
1185 | // vectorization and the results of the alias analysis are not stable during |
1186 | // that process. |
1187 | bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, |
1188 | AliasSetTracker &WriteSet, Instruction *I, |
1189 | Instruction *J, bool UpdateUsers, |
1190 | DenseSet<ValuePair> *LoadMoveSetPairs) { |
1191 | bool UsesI = false; |
1192 | |
1193 | // This instruction may already be marked as a user due, for example, to |
1194 | // being a member of a selected pair. |
1195 | if (Users.count(J)) |
1196 | UsesI = true; |
1197 | |
1198 | if (!UsesI) |
1199 | for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); |
1200 | JU != JE; ++JU) { |
1201 | Value *V = *JU; |
1202 | if (I == V || Users.count(V)) { |
1203 | UsesI = true; |
1204 | break; |
1205 | } |
1206 | } |
1207 | if (!UsesI && J->mayReadFromMemory()) { |
1208 | if (LoadMoveSetPairs) { |
1209 | UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); |
1210 | } else { |
1211 | for (AliasSetTracker::iterator W = WriteSet.begin(), |
1212 | WE = WriteSet.end(); W != WE; ++W) { |
1213 | if (W->aliasesUnknownInst(J, *AA)) { |
1214 | UsesI = true; |
1215 | break; |
1216 | } |
1217 | } |
1218 | } |
1219 | } |
1220 | |
1221 | if (UsesI && UpdateUsers) { |
1222 | if (J->mayWriteToMemory()) WriteSet.add(J); |
1223 | Users.insert(J); |
1224 | } |
1225 | |
1226 | return UsesI; |
1227 | } |
1228 | |
1229 | // This function iterates over all instruction pairs in the provided |
1230 | // basic block and collects all candidate pairs for vectorization. |
1231 | bool BBVectorize::getCandidatePairs(BasicBlock &BB, |
1232 | BasicBlock::iterator &Start, |
1233 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1234 | DenseSet<ValuePair> &FixedOrderPairs, |
1235 | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
1236 | std::vector<Value *> &PairableInsts, bool NonPow2Len) { |
1237 | size_t TotalPairs = 0; |
1238 | BasicBlock::iterator E = BB.end(); |
1239 | if (Start == E) return false; |
1240 | |
1241 | bool ShouldContinue = false, IAfterStart = false; |
1242 | for (BasicBlock::iterator I = Start++; I != E; ++I) { |
1243 | if (I == Start) IAfterStart = true; |
1244 | |
1245 | bool IsSimpleLoadStore; |
1246 | if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; |
1247 | |
1248 | // Look for an instruction with which to pair instruction *I... |
1249 | DenseSet<Value *> Users; |
1250 | AliasSetTracker WriteSet(*AA); |
1251 | if (I->mayWriteToMemory()) WriteSet.add(I); |
1252 | |
1253 | bool JAfterStart = IAfterStart; |
1254 | BasicBlock::iterator J = std::next(I); |
1255 | for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { |
1256 | if (J == Start) JAfterStart = true; |
1257 | |
1258 | // Determine if J uses I, if so, exit the loop. |
1259 | bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); |
1260 | if (Config.FastDep) { |
1261 | // Note: For this heuristic to be effective, independent operations |
1262 | // must tend to be intermixed. This is likely to be true from some |
1263 | // kinds of grouped loop unrolling (but not the generic LLVM pass), |
1264 | // but otherwise may require some kind of reordering pass. |
1265 | |
1266 | // When using fast dependency analysis, |
1267 | // stop searching after first use: |
1268 | if (UsesI) break; |
1269 | } else { |
1270 | if (UsesI) continue; |
1271 | } |
1272 | |
1273 | // J does not use I, and comes before the first use of I, so it can be |
1274 | // merged with I if the instructions are compatible. |
1275 | int CostSavings, FixedOrder; |
1276 | if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, |
1277 | CostSavings, FixedOrder)) continue; |
1278 | |
1279 | // J is a candidate for merging with I. |
1280 | if (!PairableInsts.size() || |
1281 | PairableInsts[PairableInsts.size()-1] != I) { |
1282 | PairableInsts.push_back(I); |
1283 | } |
1284 | |
1285 | CandidatePairs[I].push_back(J); |
1286 | ++TotalPairs; |
1287 | if (TTI) |
1288 | CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), |
1289 | CostSavings)); |
1290 | |
1291 | if (FixedOrder == 1) |
1292 | FixedOrderPairs.insert(ValuePair(I, J)); |
1293 | else if (FixedOrder == -1) |
1294 | FixedOrderPairs.insert(ValuePair(J, I)); |
1295 | |
1296 | // The next call to this function must start after the last instruction |
1297 | // selected during this invocation. |
1298 | if (JAfterStart) { |
1299 | Start = std::next(J); |
1300 | IAfterStart = JAfterStart = false; |
1301 | } |
1302 | |
1303 | DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " << *I << " <-> " << *J << " (cost savings: " << CostSavings << ")\n"; } } while (0) |
1304 | << *I << " <-> " << *J << " (cost savings: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " << *I << " <-> " << *J << " (cost savings: " << CostSavings << ")\n"; } } while (0) |
1305 | CostSavings << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " << *I << " <-> " << *J << " (cost savings: " << CostSavings << ")\n"; } } while (0); |
1306 | |
1307 | // If we have already found too many pairs, break here and this function |
1308 | // will be called again starting after the last instruction selected |
1309 | // during this invocation. |
1310 | if (PairableInsts.size() >= Config.MaxInsts || |
1311 | TotalPairs >= Config.MaxPairs) { |
1312 | ShouldContinue = true; |
1313 | break; |
1314 | } |
1315 | } |
1316 | |
1317 | if (ShouldContinue) |
1318 | break; |
1319 | } |
1320 | |
1321 | DEBUG(dbgs() << "BBV: found " << PairableInsts.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: found " << PairableInsts .size() << " instructions with candidate pairs\n"; } } while (0) |
1322 | << " instructions with candidate pairs\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: found " << PairableInsts .size() << " instructions with candidate pairs\n"; } } while (0); |
1323 | |
1324 | return ShouldContinue; |
1325 | } |
1326 | |
1327 | // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that |
1328 | // it looks for pairs such that both members have an input which is an |
1329 | // output of PI or PJ. |
1330 | void BBVectorize::computePairsConnectedTo( |
1331 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1332 | DenseSet<ValuePair> &CandidatePairsSet, |
1333 | std::vector<Value *> &PairableInsts, |
1334 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1335 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
1336 | ValuePair P) { |
1337 | StoreInst *SI, *SJ; |
1338 | |
1339 | // For each possible pairing for this variable, look at the uses of |
1340 | // the first value... |
1341 | for (Value::user_iterator I = P.first->user_begin(), |
1342 | E = P.first->user_end(); |
1343 | I != E; ++I) { |
1344 | User *UI = *I; |
1345 | if (isa<LoadInst>(UI)) { |
1346 | // A pair cannot be connected to a load because the load only takes one |
1347 | // operand (the address) and it is a scalar even after vectorization. |
1348 | continue; |
1349 | } else if ((SI = dyn_cast<StoreInst>(UI)) && |
1350 | P.first == SI->getPointerOperand()) { |
1351 | // Similarly, a pair cannot be connected to a store through its |
1352 | // pointer operand. |
1353 | continue; |
1354 | } |
1355 | |
1356 | // For each use of the first variable, look for uses of the second |
1357 | // variable... |
1358 | for (User *UJ : P.second->users()) { |
1359 | if ((SJ = dyn_cast<StoreInst>(UJ)) && |
1360 | P.second == SJ->getPointerOperand()) |
1361 | continue; |
1362 | |
1363 | // Look for <I, J>: |
1364 | if (CandidatePairsSet.count(ValuePair(UI, UJ))) { |
1365 | VPPair VP(P, ValuePair(UI, UJ)); |
1366 | ConnectedPairs[VP.first].push_back(VP.second); |
1367 | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); |
1368 | } |
1369 | |
1370 | // Look for <J, I>: |
1371 | if (CandidatePairsSet.count(ValuePair(UJ, UI))) { |
1372 | VPPair VP(P, ValuePair(UJ, UI)); |
1373 | ConnectedPairs[VP.first].push_back(VP.second); |
1374 | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); |
1375 | } |
1376 | } |
1377 | |
1378 | if (Config.SplatBreaksChain) continue; |
1379 | // Look for cases where just the first value in the pair is used by |
1380 | // both members of another pair (splatting). |
1381 | for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) { |
1382 | User *UJ = *J; |
1383 | if ((SJ = dyn_cast<StoreInst>(UJ)) && |
1384 | P.first == SJ->getPointerOperand()) |
1385 | continue; |
1386 | |
1387 | if (CandidatePairsSet.count(ValuePair(UI, UJ))) { |
1388 | VPPair VP(P, ValuePair(UI, UJ)); |
1389 | ConnectedPairs[VP.first].push_back(VP.second); |
1390 | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); |
1391 | } |
1392 | } |
1393 | } |
1394 | |
1395 | if (Config.SplatBreaksChain) return; |
1396 | // Look for cases where just the second value in the pair is used by |
1397 | // both members of another pair (splatting). |
1398 | for (Value::user_iterator I = P.second->user_begin(), |
1399 | E = P.second->user_end(); |
1400 | I != E; ++I) { |
1401 | User *UI = *I; |
1402 | if (isa<LoadInst>(UI)) |
1403 | continue; |
1404 | else if ((SI = dyn_cast<StoreInst>(UI)) && |
1405 | P.second == SI->getPointerOperand()) |
1406 | continue; |
1407 | |
1408 | for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) { |
1409 | User *UJ = *J; |
1410 | if ((SJ = dyn_cast<StoreInst>(UJ)) && |
1411 | P.second == SJ->getPointerOperand()) |
1412 | continue; |
1413 | |
1414 | if (CandidatePairsSet.count(ValuePair(UI, UJ))) { |
1415 | VPPair VP(P, ValuePair(UI, UJ)); |
1416 | ConnectedPairs[VP.first].push_back(VP.second); |
1417 | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); |
1418 | } |
1419 | } |
1420 | } |
1421 | } |
1422 | |
1423 | // This function figures out which pairs are connected. Two pairs are |
1424 | // connected if some output of the first pair forms an input to both members |
1425 | // of the second pair. |
1426 | void BBVectorize::computeConnectedPairs( |
1427 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1428 | DenseSet<ValuePair> &CandidatePairsSet, |
1429 | std::vector<Value *> &PairableInsts, |
1430 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1431 | DenseMap<VPPair, unsigned> &PairConnectionTypes) { |
1432 | for (std::vector<Value *>::iterator PI = PairableInsts.begin(), |
1433 | PE = PairableInsts.end(); PI != PE; ++PI) { |
1434 | DenseMap<Value *, std::vector<Value *> >::iterator PP = |
1435 | CandidatePairs.find(*PI); |
1436 | if (PP == CandidatePairs.end()) |
1437 | continue; |
1438 | |
1439 | for (std::vector<Value *>::iterator P = PP->second.begin(), |
1440 | E = PP->second.end(); P != E; ++P) |
1441 | computePairsConnectedTo(CandidatePairs, CandidatePairsSet, |
1442 | PairableInsts, ConnectedPairs, |
1443 | PairConnectionTypes, ValuePair(*PI, *P)); |
1444 | } |
1445 | |
1446 | DEBUG(size_t TotalPairs = 0;do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { size_t TotalPairs = 0; for (DenseMap<ValuePair , std::vector<ValuePair> >::iterator I = ConnectedPairs .begin(), IE = ConnectedPairs.end(); I != IE; ++I) TotalPairs += I->second.size(); dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"; } } while (0) |
1447 | for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { size_t TotalPairs = 0; for (DenseMap<ValuePair , std::vector<ValuePair> >::iterator I = ConnectedPairs .begin(), IE = ConnectedPairs.end(); I != IE; ++I) TotalPairs += I->second.size(); dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"; } } while (0) |
1448 | ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { size_t TotalPairs = 0; for (DenseMap<ValuePair , std::vector<ValuePair> >::iterator I = ConnectedPairs .begin(), IE = ConnectedPairs.end(); I != IE; ++I) TotalPairs += I->second.size(); dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"; } } while (0) |
1449 | TotalPairs += I->second.size();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { size_t TotalPairs = 0; for (DenseMap<ValuePair , std::vector<ValuePair> >::iterator I = ConnectedPairs .begin(), IE = ConnectedPairs.end(); I != IE; ++I) TotalPairs += I->second.size(); dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"; } } while (0) |
1450 | dbgs() << "BBV: found " << TotalPairsdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { size_t TotalPairs = 0; for (DenseMap<ValuePair , std::vector<ValuePair> >::iterator I = ConnectedPairs .begin(), IE = ConnectedPairs.end(); I != IE; ++I) TotalPairs += I->second.size(); dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"; } } while (0) |
1451 | << " pair connections.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { size_t TotalPairs = 0; for (DenseMap<ValuePair , std::vector<ValuePair> >::iterator I = ConnectedPairs .begin(), IE = ConnectedPairs.end(); I != IE; ++I) TotalPairs += I->second.size(); dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"; } } while (0); |
1452 | } |
1453 | |
1454 | // This function builds a set of use tuples such that <A, B> is in the set |
1455 | // if B is in the use dag of A. If B is in the use dag of A, then B |
1456 | // depends on the output of A. |
1457 | void BBVectorize::buildDepMap( |
1458 | BasicBlock &BB, |
1459 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1460 | std::vector<Value *> &PairableInsts, |
1461 | DenseSet<ValuePair> &PairableInstUsers) { |
1462 | DenseSet<Value *> IsInPair; |
1463 | for (DenseMap<Value *, std::vector<Value *> >::iterator C = |
1464 | CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { |
1465 | IsInPair.insert(C->first); |
1466 | IsInPair.insert(C->second.begin(), C->second.end()); |
1467 | } |
1468 | |
1469 | // Iterate through the basic block, recording all users of each |
1470 | // pairable instruction. |
1471 | |
1472 | BasicBlock::iterator E = BB.end(), EL = |
1473 | BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); |
1474 | for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { |
1475 | if (IsInPair.find(I) == IsInPair.end()) continue; |
1476 | |
1477 | DenseSet<Value *> Users; |
1478 | AliasSetTracker WriteSet(*AA); |
1479 | if (I->mayWriteToMemory()) WriteSet.add(I); |
1480 | |
1481 | for (BasicBlock::iterator J = std::next(I); J != E; ++J) { |
1482 | (void) trackUsesOfI(Users, WriteSet, I, J); |
1483 | |
1484 | if (J == EL) |
1485 | break; |
1486 | } |
1487 | |
1488 | for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); |
1489 | U != E; ++U) { |
1490 | if (IsInPair.find(*U) == IsInPair.end()) continue; |
1491 | PairableInstUsers.insert(ValuePair(I, *U)); |
1492 | } |
1493 | |
1494 | if (I == EL) |
1495 | break; |
1496 | } |
1497 | } |
1498 | |
1499 | // Returns true if an input to pair P is an output of pair Q and also an |
1500 | // input of pair Q is an output of pair P. If this is the case, then these |
1501 | // two pairs cannot be simultaneously fused. |
1502 | bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, |
1503 | DenseSet<ValuePair> &PairableInstUsers, |
1504 | DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, |
1505 | DenseSet<VPPair> *PairableInstUserPairSet) { |
1506 | // Two pairs are in conflict if they are mutual Users of eachother. |
1507 | bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || |
1508 | PairableInstUsers.count(ValuePair(P.first, Q.second)) || |
1509 | PairableInstUsers.count(ValuePair(P.second, Q.first)) || |
1510 | PairableInstUsers.count(ValuePair(P.second, Q.second)); |
1511 | bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || |
1512 | PairableInstUsers.count(ValuePair(Q.first, P.second)) || |
1513 | PairableInstUsers.count(ValuePair(Q.second, P.first)) || |
1514 | PairableInstUsers.count(ValuePair(Q.second, P.second)); |
1515 | if (PairableInstUserMap) { |
1516 | // FIXME: The expensive part of the cycle check is not so much the cycle |
1517 | // check itself but this edge insertion procedure. This needs some |
1518 | // profiling and probably a different data structure. |
1519 | if (PUsesQ) { |
1520 | if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) |
1521 | (*PairableInstUserMap)[Q].push_back(P); |
1522 | } |
1523 | if (QUsesP) { |
1524 | if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) |
1525 | (*PairableInstUserMap)[P].push_back(Q); |
1526 | } |
1527 | } |
1528 | |
1529 | return (QUsesP && PUsesQ); |
1530 | } |
1531 | |
1532 | // This function walks the use graph of current pairs to see if, starting |
1533 | // from P, the walk returns to P. |
1534 | bool BBVectorize::pairWillFormCycle(ValuePair P, |
1535 | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
1536 | DenseSet<ValuePair> &CurrentPairs) { |
1537 | DEBUG(if (DebugCycleCheck)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCycleCheck) dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " << *P.second << "\n"; } } while (0) |
1538 | dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCycleCheck) dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " << *P.second << "\n"; } } while (0) |
1539 | << *P.second << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCycleCheck) dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " << *P.second << "\n"; } } while (0); |
1540 | // A lookup table of visisted pairs is kept because the PairableInstUserMap |
1541 | // contains non-direct associations. |
1542 | DenseSet<ValuePair> Visited; |
1543 | SmallVector<ValuePair, 32> Q; |
1544 | // General depth-first post-order traversal: |
1545 | Q.push_back(P); |
1546 | do { |
1547 | ValuePair QTop = Q.pop_back_val(); |
1548 | Visited.insert(QTop); |
1549 | |
1550 | DEBUG(if (DebugCycleCheck)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCycleCheck) dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " << *QTop.second << "\n"; } } while (0) |
1551 | dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCycleCheck) dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " << *QTop.second << "\n"; } } while (0) |
1552 | << *QTop.second << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugCycleCheck) dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " << *QTop.second << "\n"; } } while (0); |
1553 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = |
1554 | PairableInstUserMap.find(QTop); |
1555 | if (QQ == PairableInstUserMap.end()) |
1556 | continue; |
1557 | |
1558 | for (std::vector<ValuePair>::iterator C = QQ->second.begin(), |
1559 | CE = QQ->second.end(); C != CE; ++C) { |
1560 | if (*C == P) { |
1561 | DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: rejected to prevent non-trivial cycle formation: " << QTop.first << " <-> " << C->second << "\n"; } } while (0) |
1562 | << "BBV: rejected to prevent non-trivial cycle formation: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: rejected to prevent non-trivial cycle formation: " << QTop.first << " <-> " << C->second << "\n"; } } while (0) |
1563 | << QTop.first << " <-> " << C->second << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: rejected to prevent non-trivial cycle formation: " << QTop.first << " <-> " << C->second << "\n"; } } while (0); |
1564 | return true; |
1565 | } |
1566 | |
1567 | if (CurrentPairs.count(*C) && !Visited.count(*C)) |
1568 | Q.push_back(*C); |
1569 | } |
1570 | } while (!Q.empty()); |
1571 | |
1572 | return false; |
1573 | } |
1574 | |
1575 | // This function builds the initial dag of connected pairs with the |
1576 | // pair J at the root. |
1577 | void BBVectorize::buildInitialDAGFor( |
1578 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1579 | DenseSet<ValuePair> &CandidatePairsSet, |
1580 | std::vector<Value *> &PairableInsts, |
1581 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1582 | DenseSet<ValuePair> &PairableInstUsers, |
1583 | DenseMap<Value *, Value *> &ChosenPairs, |
1584 | DenseMap<ValuePair, size_t> &DAG, ValuePair J) { |
1585 | // Each of these pairs is viewed as the root node of a DAG. The DAG |
1586 | // is then walked (depth-first). As this happens, we keep track of |
1587 | // the pairs that compose the DAG and the maximum depth of the DAG. |
1588 | SmallVector<ValuePairWithDepth, 32> Q; |
1589 | // General depth-first post-order traversal: |
1590 | Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); |
1591 | do { |
1592 | ValuePairWithDepth QTop = Q.back(); |
1593 | |
1594 | // Push each child onto the queue: |
1595 | bool MoreChildren = false; |
1596 | size_t MaxChildDepth = QTop.second; |
1597 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = |
1598 | ConnectedPairs.find(QTop.first); |
1599 | if (QQ != ConnectedPairs.end()) |
1600 | for (std::vector<ValuePair>::iterator k = QQ->second.begin(), |
1601 | ke = QQ->second.end(); k != ke; ++k) { |
1602 | // Make sure that this child pair is still a candidate: |
1603 | if (CandidatePairsSet.count(*k)) { |
1604 | DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); |
1605 | if (C == DAG.end()) { |
1606 | size_t d = getDepthFactor(k->first); |
1607 | Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); |
1608 | MoreChildren = true; |
1609 | } else { |
1610 | MaxChildDepth = std::max(MaxChildDepth, C->second); |
1611 | } |
1612 | } |
1613 | } |
1614 | |
1615 | if (!MoreChildren) { |
1616 | // Record the current pair as part of the DAG: |
1617 | DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); |
1618 | Q.pop_back(); |
1619 | } |
1620 | } while (!Q.empty()); |
1621 | } |
1622 | |
1623 | // Given some initial dag, prune it by removing conflicting pairs (pairs |
1624 | // that cannot be simultaneously chosen for vectorization). |
1625 | void BBVectorize::pruneDAGFor( |
1626 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1627 | std::vector<Value *> &PairableInsts, |
1628 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1629 | DenseSet<ValuePair> &PairableInstUsers, |
1630 | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
1631 | DenseSet<VPPair> &PairableInstUserPairSet, |
1632 | DenseMap<Value *, Value *> &ChosenPairs, |
1633 | DenseMap<ValuePair, size_t> &DAG, |
1634 | DenseSet<ValuePair> &PrunedDAG, ValuePair J, |
1635 | bool UseCycleCheck) { |
1636 | SmallVector<ValuePairWithDepth, 32> Q; |
1637 | // General depth-first post-order traversal: |
1638 | Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); |
1639 | do { |
1640 | ValuePairWithDepth QTop = Q.pop_back_val(); |
1641 | PrunedDAG.insert(QTop.first); |
1642 | |
1643 | // Visit each child, pruning as necessary... |
1644 | SmallVector<ValuePairWithDepth, 8> BestChildren; |
1645 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = |
1646 | ConnectedPairs.find(QTop.first); |
1647 | if (QQ == ConnectedPairs.end()) |
1648 | continue; |
1649 | |
1650 | for (std::vector<ValuePair>::iterator K = QQ->second.begin(), |
1651 | KE = QQ->second.end(); K != KE; ++K) { |
1652 | DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); |
1653 | if (C == DAG.end()) continue; |
1654 | |
1655 | // This child is in the DAG, now we need to make sure it is the |
1656 | // best of any conflicting children. There could be multiple |
1657 | // conflicting children, so first, determine if we're keeping |
1658 | // this child, then delete conflicting children as necessary. |
1659 | |
1660 | // It is also necessary to guard against pairing-induced |
1661 | // dependencies. Consider instructions a .. x .. y .. b |
1662 | // such that (a,b) are to be fused and (x,y) are to be fused |
1663 | // but a is an input to x and b is an output from y. This |
1664 | // means that y cannot be moved after b but x must be moved |
1665 | // after b for (a,b) to be fused. In other words, after |
1666 | // fusing (a,b) we have y .. a/b .. x where y is an input |
1667 | // to a/b and x is an output to a/b: x and y can no longer |
1668 | // be legally fused. To prevent this condition, we must |
1669 | // make sure that a child pair added to the DAG is not |
1670 | // both an input and output of an already-selected pair. |
1671 | |
1672 | // Pairing-induced dependencies can also form from more complicated |
1673 | // cycles. The pair vs. pair conflicts are easy to check, and so |
1674 | // that is done explicitly for "fast rejection", and because for |
1675 | // child vs. child conflicts, we may prefer to keep the current |
1676 | // pair in preference to the already-selected child. |
1677 | DenseSet<ValuePair> CurrentPairs; |
1678 | |
1679 | bool CanAdd = true; |
1680 | for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 |
1681 | = BestChildren.begin(), E2 = BestChildren.end(); |
1682 | C2 != E2; ++C2) { |
1683 | if (C2->first.first == C->first.first || |
1684 | C2->first.first == C->first.second || |
1685 | C2->first.second == C->first.first || |
1686 | C2->first.second == C->first.second || |
1687 | pairsConflict(C2->first, C->first, PairableInstUsers, |
1688 | UseCycleCheck ? &PairableInstUserMap : nullptr, |
1689 | UseCycleCheck ? &PairableInstUserPairSet |
1690 | : nullptr)) { |
1691 | if (C2->second >= C->second) { |
1692 | CanAdd = false; |
1693 | break; |
1694 | } |
1695 | |
1696 | CurrentPairs.insert(C2->first); |
1697 | } |
1698 | } |
1699 | if (!CanAdd) continue; |
1700 | |
1701 | // Even worse, this child could conflict with another node already |
1702 | // selected for the DAG. If that is the case, ignore this child. |
1703 | for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), |
1704 | E2 = PrunedDAG.end(); T != E2; ++T) { |
1705 | if (T->first == C->first.first || |
1706 | T->first == C->first.second || |
1707 | T->second == C->first.first || |
1708 | T->second == C->first.second || |
1709 | pairsConflict(*T, C->first, PairableInstUsers, |
1710 | UseCycleCheck ? &PairableInstUserMap : nullptr, |
1711 | UseCycleCheck ? &PairableInstUserPairSet |
1712 | : nullptr)) { |
1713 | CanAdd = false; |
1714 | break; |
1715 | } |
1716 | |
1717 | CurrentPairs.insert(*T); |
1718 | } |
1719 | if (!CanAdd) continue; |
1720 | |
1721 | // And check the queue too... |
1722 | for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), |
1723 | E2 = Q.end(); C2 != E2; ++C2) { |
1724 | if (C2->first.first == C->first.first || |
1725 | C2->first.first == C->first.second || |
1726 | C2->first.second == C->first.first || |
1727 | C2->first.second == C->first.second || |
1728 | pairsConflict(C2->first, C->first, PairableInstUsers, |
1729 | UseCycleCheck ? &PairableInstUserMap : nullptr, |
1730 | UseCycleCheck ? &PairableInstUserPairSet |
1731 | : nullptr)) { |
1732 | CanAdd = false; |
1733 | break; |
1734 | } |
1735 | |
1736 | CurrentPairs.insert(C2->first); |
1737 | } |
1738 | if (!CanAdd) continue; |
1739 | |
1740 | // Last but not least, check for a conflict with any of the |
1741 | // already-chosen pairs. |
1742 | for (DenseMap<Value *, Value *>::iterator C2 = |
1743 | ChosenPairs.begin(), E2 = ChosenPairs.end(); |
1744 | C2 != E2; ++C2) { |
1745 | if (pairsConflict(*C2, C->first, PairableInstUsers, |
1746 | UseCycleCheck ? &PairableInstUserMap : nullptr, |
1747 | UseCycleCheck ? &PairableInstUserPairSet |
1748 | : nullptr)) { |
1749 | CanAdd = false; |
1750 | break; |
1751 | } |
1752 | |
1753 | CurrentPairs.insert(*C2); |
1754 | } |
1755 | if (!CanAdd) continue; |
1756 | |
1757 | // To check for non-trivial cycles formed by the addition of the |
1758 | // current pair we've formed a list of all relevant pairs, now use a |
1759 | // graph walk to check for a cycle. We start from the current pair and |
1760 | // walk the use dag to see if we again reach the current pair. If we |
1761 | // do, then the current pair is rejected. |
1762 | |
1763 | // FIXME: It may be more efficient to use a topological-ordering |
1764 | // algorithm to improve the cycle check. This should be investigated. |
1765 | if (UseCycleCheck && |
1766 | pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) |
1767 | continue; |
1768 | |
1769 | // This child can be added, but we may have chosen it in preference |
1770 | // to an already-selected child. Check for this here, and if a |
1771 | // conflict is found, then remove the previously-selected child |
1772 | // before adding this one in its place. |
1773 | for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 |
1774 | = BestChildren.begin(); C2 != BestChildren.end();) { |
1775 | if (C2->first.first == C->first.first || |
1776 | C2->first.first == C->first.second || |
1777 | C2->first.second == C->first.first || |
1778 | C2->first.second == C->first.second || |
1779 | pairsConflict(C2->first, C->first, PairableInstUsers)) |
1780 | C2 = BestChildren.erase(C2); |
1781 | else |
1782 | ++C2; |
1783 | } |
1784 | |
1785 | BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); |
1786 | } |
1787 | |
1788 | for (SmallVectorImpl<ValuePairWithDepth>::iterator C |
1789 | = BestChildren.begin(), E2 = BestChildren.end(); |
1790 | C != E2; ++C) { |
1791 | size_t DepthF = getDepthFactor(C->first.first); |
1792 | Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); |
1793 | } |
1794 | } while (!Q.empty()); |
1795 | } |
1796 | |
1797 | // This function finds the best dag of mututally-compatible connected |
1798 | // pairs, given the choice of root pairs as an iterator range. |
1799 | void BBVectorize::findBestDAGFor( |
1800 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1801 | DenseSet<ValuePair> &CandidatePairsSet, |
1802 | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
1803 | std::vector<Value *> &PairableInsts, |
1804 | DenseSet<ValuePair> &FixedOrderPairs, |
1805 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
1806 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1807 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
1808 | DenseSet<ValuePair> &PairableInstUsers, |
1809 | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
1810 | DenseSet<VPPair> &PairableInstUserPairSet, |
1811 | DenseMap<Value *, Value *> &ChosenPairs, |
1812 | DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, |
1813 | int &BestEffSize, Value *II, std::vector<Value *>&JJ, |
1814 | bool UseCycleCheck) { |
1815 | for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); |
1816 | J != JE; ++J) { |
1817 | ValuePair IJ(II, *J); |
1818 | if (!CandidatePairsSet.count(IJ)) |
1819 | continue; |
1820 | |
1821 | // Before going any further, make sure that this pair does not |
1822 | // conflict with any already-selected pairs (see comment below |
1823 | // near the DAG pruning for more details). |
1824 | DenseSet<ValuePair> ChosenPairSet; |
1825 | bool DoesConflict = false; |
1826 | for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), |
1827 | E = ChosenPairs.end(); C != E; ++C) { |
1828 | if (pairsConflict(*C, IJ, PairableInstUsers, |
1829 | UseCycleCheck ? &PairableInstUserMap : nullptr, |
1830 | UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { |
1831 | DoesConflict = true; |
1832 | break; |
1833 | } |
1834 | |
1835 | ChosenPairSet.insert(*C); |
1836 | } |
1837 | if (DoesConflict) continue; |
1838 | |
1839 | if (UseCycleCheck && |
1840 | pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) |
1841 | continue; |
1842 | |
1843 | DenseMap<ValuePair, size_t> DAG; |
1844 | buildInitialDAGFor(CandidatePairs, CandidatePairsSet, |
1845 | PairableInsts, ConnectedPairs, |
1846 | PairableInstUsers, ChosenPairs, DAG, IJ); |
1847 | |
1848 | // Because we'll keep the child with the largest depth, the largest |
1849 | // depth is still the same in the unpruned DAG. |
1850 | size_t MaxDepth = DAG.lookup(IJ); |
1851 | |
1852 | DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << DAG.size() << "\n"; } } while (0) |
1853 | << *IJ.first << " <-> " << *IJ.second << "} of depth " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << DAG.size() << "\n"; } } while (0) |
1854 | MaxDepth << " and size " << DAG.size() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << DAG.size() << "\n"; } } while (0); |
1855 | |
1856 | // At this point the DAG has been constructed, but, may contain |
1857 | // contradictory children (meaning that different children of |
1858 | // some dag node may be attempting to fuse the same instruction). |
1859 | // So now we walk the dag again, in the case of a conflict, |
1860 | // keep only the child with the largest depth. To break a tie, |
1861 | // favor the first child. |
1862 | |
1863 | DenseSet<ValuePair> PrunedDAG; |
1864 | pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, |
1865 | PairableInstUsers, PairableInstUserMap, |
1866 | PairableInstUserPairSet, |
1867 | ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); |
1868 | |
1869 | int EffSize = 0; |
1870 | if (TTI) { |
1871 | DenseSet<Value *> PrunedDAGInstrs; |
1872 | for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), |
1873 | E = PrunedDAG.end(); S != E; ++S) { |
1874 | PrunedDAGInstrs.insert(S->first); |
1875 | PrunedDAGInstrs.insert(S->second); |
1876 | } |
1877 | |
1878 | // The set of pairs that have already contributed to the total cost. |
1879 | DenseSet<ValuePair> IncomingPairs; |
1880 | |
1881 | // If the cost model were perfect, this might not be necessary; but we |
1882 | // need to make sure that we don't get stuck vectorizing our own |
1883 | // shuffle chains. |
1884 | bool HasNontrivialInsts = false; |
1885 | |
1886 | // The node weights represent the cost savings associated with |
1887 | // fusing the pair of instructions. |
1888 | for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), |
1889 | E = PrunedDAG.end(); S != E; ++S) { |
1890 | if (!isa<ShuffleVectorInst>(S->first) && |
1891 | !isa<InsertElementInst>(S->first) && |
1892 | !isa<ExtractElementInst>(S->first)) |
1893 | HasNontrivialInsts = true; |
1894 | |
1895 | bool FlipOrder = false; |
1896 | |
1897 | if (getDepthFactor(S->first)) { |
1898 | int ESContrib = CandidatePairCostSavings.find(*S)->second; |
1899 | DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tweight {" << *S->first << " <-> " << *S-> second << "} = " << ESContrib << "\n"; } } while (0) |
1900 | << *S->first << " <-> " << *S->second << "} = " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tweight {" << *S->first << " <-> " << *S-> second << "} = " << ESContrib << "\n"; } } while (0) |
1901 | ESContrib << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tweight {" << *S->first << " <-> " << *S-> second << "} = " << ESContrib << "\n"; } } while (0); |
1902 | EffSize += ESContrib; |
1903 | } |
1904 | |
1905 | // The edge weights contribute in a negative sense: they represent |
1906 | // the cost of shuffles. |
1907 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = |
1908 | ConnectedPairDeps.find(*S); |
1909 | if (SS != ConnectedPairDeps.end()) { |
1910 | unsigned NumDepsDirect = 0, NumDepsSwap = 0; |
1911 | for (std::vector<ValuePair>::iterator T = SS->second.begin(), |
1912 | TE = SS->second.end(); T != TE; ++T) { |
1913 | VPPair Q(*S, *T); |
1914 | if (!PrunedDAG.count(Q.second)) |
1915 | continue; |
1916 | DenseMap<VPPair, unsigned>::iterator R = |
1917 | PairConnectionTypes.find(VPPair(Q.second, Q.first)); |
1918 | assert(R != PairConnectionTypes.end() &&((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 1919, __PRETTY_FUNCTION__)) |
1919 | "Cannot find pair connection type")((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 1919, __PRETTY_FUNCTION__)); |
1920 | if (R->second == PairConnectionDirect) |
1921 | ++NumDepsDirect; |
1922 | else if (R->second == PairConnectionSwap) |
1923 | ++NumDepsSwap; |
1924 | } |
1925 | |
1926 | // If there are more swaps than direct connections, then |
1927 | // the pair order will be flipped during fusion. So the real |
1928 | // number of swaps is the minimum number. |
1929 | FlipOrder = !FixedOrderPairs.count(*S) && |
1930 | ((NumDepsSwap > NumDepsDirect) || |
1931 | FixedOrderPairs.count(ValuePair(S->second, S->first))); |
1932 | |
1933 | for (std::vector<ValuePair>::iterator T = SS->second.begin(), |
1934 | TE = SS->second.end(); T != TE; ++T) { |
1935 | VPPair Q(*S, *T); |
1936 | if (!PrunedDAG.count(Q.second)) |
1937 | continue; |
1938 | DenseMap<VPPair, unsigned>::iterator R = |
1939 | PairConnectionTypes.find(VPPair(Q.second, Q.first)); |
1940 | assert(R != PairConnectionTypes.end() &&((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 1941, __PRETTY_FUNCTION__)) |
1941 | "Cannot find pair connection type")((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 1941, __PRETTY_FUNCTION__)); |
1942 | Type *Ty1 = Q.second.first->getType(), |
1943 | *Ty2 = Q.second.second->getType(); |
1944 | Type *VTy = getVecTypeForPair(Ty1, Ty2); |
1945 | if ((R->second == PairConnectionDirect && FlipOrder) || |
1946 | (R->second == PairConnectionSwap && !FlipOrder) || |
1947 | R->second == PairConnectionSplat) { |
1948 | int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
1949 | VTy, VTy); |
1950 | |
1951 | if (VTy->getVectorNumElements() == 2) { |
1952 | if (R->second == PairConnectionSplat) |
1953 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
1954 | TargetTransformInfo::SK_Broadcast, VTy)); |
1955 | else |
1956 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
1957 | TargetTransformInfo::SK_Reverse, VTy)); |
1958 | } |
1959 | |
1960 | DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *Q.second.first << " <-> " << *Q. second.second << "} -> {" << *S->first << " <-> " << *S->second << "} = " << ESContrib << "\n"; } } while (0) |
1961 | *Q.second.first << " <-> " << *Q.second.second <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *Q.second.first << " <-> " << *Q. second.second << "} -> {" << *S->first << " <-> " << *S->second << "} = " << ESContrib << "\n"; } } while (0) |
1962 | "} -> {" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *Q.second.first << " <-> " << *Q. second.second << "} -> {" << *S->first << " <-> " << *S->second << "} = " << ESContrib << "\n"; } } while (0) |
1963 | *S->first << " <-> " << *S->second << "} = " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *Q.second.first << " <-> " << *Q. second.second << "} -> {" << *S->first << " <-> " << *S->second << "} = " << ESContrib << "\n"; } } while (0) |
1964 | ESContrib << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *Q.second.first << " <-> " << *Q. second.second << "} -> {" << *S->first << " <-> " << *S->second << "} = " << ESContrib << "\n"; } } while (0); |
1965 | EffSize -= ESContrib; |
1966 | } |
1967 | } |
1968 | } |
1969 | |
1970 | // Compute the cost of outgoing edges. We assume that edges outgoing |
1971 | // to shuffles, inserts or extracts can be merged, and so contribute |
1972 | // no additional cost. |
1973 | if (!S->first->getType()->isVoidTy()) { |
1974 | Type *Ty1 = S->first->getType(), |
1975 | *Ty2 = S->second->getType(); |
1976 | Type *VTy = getVecTypeForPair(Ty1, Ty2); |
1977 | |
1978 | bool NeedsExtraction = false; |
1979 | for (User *U : S->first->users()) { |
1980 | if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { |
1981 | // Shuffle can be folded if it has no other input |
1982 | if (isa<UndefValue>(SI->getOperand(1))) |
1983 | continue; |
1984 | } |
1985 | if (isa<ExtractElementInst>(U)) |
1986 | continue; |
1987 | if (PrunedDAGInstrs.count(U)) |
1988 | continue; |
1989 | NeedsExtraction = true; |
1990 | break; |
1991 | } |
1992 | |
1993 | if (NeedsExtraction) { |
1994 | int ESContrib; |
1995 | if (Ty1->isVectorTy()) { |
1996 | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
1997 | Ty1, VTy); |
1998 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
1999 | TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); |
2000 | } else |
2001 | ESContrib = (int) TTI->getVectorInstrCost( |
2002 | Instruction::ExtractElement, VTy, 0); |
2003 | |
2004 | DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *S->first << "} = " << ESContrib << "\n"; } } while (0) |
2005 | *S->first << "} = " << ESContrib << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *S->first << "} = " << ESContrib << "\n"; } } while (0); |
2006 | EffSize -= ESContrib; |
2007 | } |
2008 | |
2009 | NeedsExtraction = false; |
2010 | for (User *U : S->second->users()) { |
2011 | if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { |
2012 | // Shuffle can be folded if it has no other input |
2013 | if (isa<UndefValue>(SI->getOperand(1))) |
2014 | continue; |
2015 | } |
2016 | if (isa<ExtractElementInst>(U)) |
2017 | continue; |
2018 | if (PrunedDAGInstrs.count(U)) |
2019 | continue; |
2020 | NeedsExtraction = true; |
2021 | break; |
2022 | } |
2023 | |
2024 | if (NeedsExtraction) { |
2025 | int ESContrib; |
2026 | if (Ty2->isVectorTy()) { |
2027 | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
2028 | Ty2, VTy); |
2029 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
2030 | TargetTransformInfo::SK_ExtractSubvector, VTy, |
2031 | Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); |
2032 | } else |
2033 | ESContrib = (int) TTI->getVectorInstrCost( |
2034 | Instruction::ExtractElement, VTy, 1); |
2035 | DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *S->second << "} = " << ESContrib << "\n"; } } while (0) |
2036 | *S->second << "} = " << ESContrib << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *S->second << "} = " << ESContrib << "\n"; } } while (0); |
2037 | EffSize -= ESContrib; |
2038 | } |
2039 | } |
2040 | |
2041 | // Compute the cost of incoming edges. |
2042 | if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { |
2043 | Instruction *S1 = cast<Instruction>(S->first), |
2044 | *S2 = cast<Instruction>(S->second); |
2045 | for (unsigned o = 0; o < S1->getNumOperands(); ++o) { |
2046 | Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); |
2047 | |
2048 | // Combining constants into vector constants (or small vector |
2049 | // constants into larger ones are assumed free). |
2050 | if (isa<Constant>(O1) && isa<Constant>(O2)) |
2051 | continue; |
2052 | |
2053 | if (FlipOrder) |
2054 | std::swap(O1, O2); |
2055 | |
2056 | ValuePair VP = ValuePair(O1, O2); |
2057 | ValuePair VPR = ValuePair(O2, O1); |
2058 | |
2059 | // Internal edges are not handled here. |
2060 | if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) |
2061 | continue; |
2062 | |
2063 | Type *Ty1 = O1->getType(), |
2064 | *Ty2 = O2->getType(); |
2065 | Type *VTy = getVecTypeForPair(Ty1, Ty2); |
2066 | |
2067 | // Combining vector operations of the same type is also assumed |
2068 | // folded with other operations. |
2069 | if (Ty1 == Ty2) { |
2070 | // If both are insert elements, then both can be widened. |
2071 | InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), |
2072 | *IEO2 = dyn_cast<InsertElementInst>(O2); |
2073 | if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) |
2074 | continue; |
2075 | // If both are extract elements, and both have the same input |
2076 | // type, then they can be replaced with a shuffle |
2077 | ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), |
2078 | *EIO2 = dyn_cast<ExtractElementInst>(O2); |
2079 | if (EIO1 && EIO2 && |
2080 | EIO1->getOperand(0)->getType() == |
2081 | EIO2->getOperand(0)->getType()) |
2082 | continue; |
2083 | // If both are a shuffle with equal operand types and only two |
2084 | // unqiue operands, then they can be replaced with a single |
2085 | // shuffle |
2086 | ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), |
2087 | *SIO2 = dyn_cast<ShuffleVectorInst>(O2); |
2088 | if (SIO1 && SIO2 && |
2089 | SIO1->getOperand(0)->getType() == |
2090 | SIO2->getOperand(0)->getType()) { |
2091 | SmallSet<Value *, 4> SIOps; |
2092 | SIOps.insert(SIO1->getOperand(0)); |
2093 | SIOps.insert(SIO1->getOperand(1)); |
2094 | SIOps.insert(SIO2->getOperand(0)); |
2095 | SIOps.insert(SIO2->getOperand(1)); |
2096 | if (SIOps.size() <= 2) |
2097 | continue; |
2098 | } |
2099 | } |
2100 | |
2101 | int ESContrib; |
2102 | // This pair has already been formed. |
2103 | if (IncomingPairs.count(VP)) { |
2104 | continue; |
2105 | } else if (IncomingPairs.count(VPR)) { |
2106 | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
2107 | VTy, VTy); |
2108 | |
2109 | if (VTy->getVectorNumElements() == 2) |
2110 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
2111 | TargetTransformInfo::SK_Reverse, VTy)); |
2112 | } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { |
2113 | ESContrib = (int) TTI->getVectorInstrCost( |
2114 | Instruction::InsertElement, VTy, 0); |
2115 | ESContrib += (int) TTI->getVectorInstrCost( |
2116 | Instruction::InsertElement, VTy, 1); |
2117 | } else if (!Ty1->isVectorTy()) { |
2118 | // O1 needs to be inserted into a vector of size O2, and then |
2119 | // both need to be shuffled together. |
2120 | ESContrib = (int) TTI->getVectorInstrCost( |
2121 | Instruction::InsertElement, Ty2, 0); |
2122 | ESContrib += (int) getInstrCost(Instruction::ShuffleVector, |
2123 | VTy, Ty2); |
2124 | } else if (!Ty2->isVectorTy()) { |
2125 | // O2 needs to be inserted into a vector of size O1, and then |
2126 | // both need to be shuffled together. |
2127 | ESContrib = (int) TTI->getVectorInstrCost( |
2128 | Instruction::InsertElement, Ty1, 0); |
2129 | ESContrib += (int) getInstrCost(Instruction::ShuffleVector, |
2130 | VTy, Ty1); |
2131 | } else { |
2132 | Type *TyBig = Ty1, *TySmall = Ty2; |
2133 | if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) |
2134 | std::swap(TyBig, TySmall); |
2135 | |
2136 | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
2137 | VTy, TyBig); |
2138 | if (TyBig != TySmall) |
2139 | ESContrib += (int) getInstrCost(Instruction::ShuffleVector, |
2140 | TyBig, TySmall); |
2141 | } |
2142 | |
2143 | DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *O1 << " <-> " << *O2 << "} = " << ESContrib << "\n"; } } while (0) |
2144 | << *O1 << " <-> " << *O2 << "} = " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *O1 << " <-> " << *O2 << "} = " << ESContrib << "\n"; } } while (0) |
2145 | ESContrib << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tcost {" << *O1 << " <-> " << *O2 << "} = " << ESContrib << "\n"; } } while (0); |
2146 | EffSize -= ESContrib; |
2147 | IncomingPairs.insert(VP); |
2148 | } |
2149 | } |
2150 | } |
2151 | |
2152 | if (!HasNontrivialInsts) { |
2153 | DEBUG(if (DebugPairSelection) dbgs() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tNo non-trivial instructions in DAG;" " override to zero effective size\n"; } } while (0) |
2154 | "\tNo non-trivial instructions in DAG;"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tNo non-trivial instructions in DAG;" " override to zero effective size\n"; } } while (0) |
2155 | " override to zero effective size\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "\tNo non-trivial instructions in DAG;" " override to zero effective size\n"; } } while (0); |
2156 | EffSize = 0; |
2157 | } |
2158 | } else { |
2159 | for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), |
2160 | E = PrunedDAG.end(); S != E; ++S) |
2161 | EffSize += (int) getDepthFactor(S->first); |
2162 | } |
2163 | |
2164 | DEBUG(if (DebugPairSelection)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found pruned DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"; } } while (0) |
2165 | dbgs() << "BBV: found pruned DAG for pair {"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found pruned DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"; } } while (0) |
2166 | << *IJ.first << " <-> " << *IJ.second << "} of depth " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found pruned DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"; } } while (0) |
2167 | MaxDepth << " and size " << PrunedDAG.size() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found pruned DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"; } } while (0) |
2168 | " (effective size: " << EffSize << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (DebugPairSelection) dbgs() << "BBV: found pruned DAG for pair {" << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"; } } while (0); |
2169 | if (((TTI && !UseChainDepthWithTI) || |
2170 | MaxDepth >= Config.ReqChainDepth) && |
2171 | EffSize > 0 && EffSize > BestEffSize) { |
2172 | BestMaxDepth = MaxDepth; |
2173 | BestEffSize = EffSize; |
2174 | BestDAG = PrunedDAG; |
2175 | } |
2176 | } |
2177 | } |
2178 | |
2179 | // Given the list of candidate pairs, this function selects those |
2180 | // that will be fused into vector instructions. |
2181 | void BBVectorize::choosePairs( |
2182 | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
2183 | DenseSet<ValuePair> &CandidatePairsSet, |
2184 | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
2185 | std::vector<Value *> &PairableInsts, |
2186 | DenseSet<ValuePair> &FixedOrderPairs, |
2187 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
2188 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
2189 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
2190 | DenseSet<ValuePair> &PairableInstUsers, |
2191 | DenseMap<Value *, Value *>& ChosenPairs) { |
2192 | bool UseCycleCheck = |
2193 | CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; |
2194 | |
2195 | DenseMap<Value *, std::vector<Value *> > CandidatePairs2; |
2196 | for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), |
2197 | E = CandidatePairsSet.end(); I != E; ++I) { |
2198 | std::vector<Value *> &JJ = CandidatePairs2[I->second]; |
2199 | if (JJ.empty()) JJ.reserve(32); |
2200 | JJ.push_back(I->first); |
2201 | } |
2202 | |
2203 | DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; |
2204 | DenseSet<VPPair> PairableInstUserPairSet; |
2205 | for (std::vector<Value *>::iterator I = PairableInsts.begin(), |
2206 | E = PairableInsts.end(); I != E; ++I) { |
2207 | // The number of possible pairings for this variable: |
2208 | size_t NumChoices = CandidatePairs.lookup(*I).size(); |
2209 | if (!NumChoices) continue; |
2210 | |
2211 | std::vector<Value *> &JJ = CandidatePairs[*I]; |
2212 | |
2213 | // The best pair to choose and its dag: |
2214 | size_t BestMaxDepth = 0; |
2215 | int BestEffSize = 0; |
2216 | DenseSet<ValuePair> BestDAG; |
2217 | findBestDAGFor(CandidatePairs, CandidatePairsSet, |
2218 | CandidatePairCostSavings, |
2219 | PairableInsts, FixedOrderPairs, PairConnectionTypes, |
2220 | ConnectedPairs, ConnectedPairDeps, |
2221 | PairableInstUsers, PairableInstUserMap, |
2222 | PairableInstUserPairSet, ChosenPairs, |
2223 | BestDAG, BestMaxDepth, BestEffSize, *I, JJ, |
2224 | UseCycleCheck); |
2225 | |
2226 | if (BestDAG.empty()) |
2227 | continue; |
2228 | |
2229 | // A dag has been chosen (or not) at this point. If no dag was |
2230 | // chosen, then this instruction, I, cannot be paired (and is no longer |
2231 | // considered). |
2232 | |
2233 | DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: selected pairs in the best DAG for: " << *cast<Instruction>(*I) << "\n"; } } while (0) |
2234 | << *cast<Instruction>(*I) << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: selected pairs in the best DAG for: " << *cast<Instruction>(*I) << "\n"; } } while (0); |
2235 | |
2236 | for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), |
2237 | SE2 = BestDAG.end(); S != SE2; ++S) { |
2238 | // Insert the members of this dag into the list of chosen pairs. |
2239 | ChosenPairs.insert(ValuePair(S->first, S->second)); |
2240 | DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: selected pair: " << *S->first << " <-> " << *S->second << "\n"; } } while (0) |
2241 | *S->second << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: selected pair: " << *S->first << " <-> " << *S->second << "\n"; } } while (0); |
2242 | |
2243 | // Remove all candidate pairs that have values in the chosen dag. |
2244 | std::vector<Value *> &KK = CandidatePairs[S->first]; |
2245 | for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); |
2246 | K != KE; ++K) { |
2247 | if (*K == S->second) |
2248 | continue; |
2249 | |
2250 | CandidatePairsSet.erase(ValuePair(S->first, *K)); |
2251 | } |
2252 | |
2253 | std::vector<Value *> &LL = CandidatePairs2[S->second]; |
2254 | for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); |
2255 | L != LE; ++L) { |
2256 | if (*L == S->first) |
2257 | continue; |
2258 | |
2259 | CandidatePairsSet.erase(ValuePair(*L, S->second)); |
2260 | } |
2261 | |
2262 | std::vector<Value *> &MM = CandidatePairs[S->second]; |
2263 | for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); |
2264 | M != ME; ++M) { |
2265 | assert(*M != S->first && "Flipped pair in candidate list?")((*M != S->first && "Flipped pair in candidate list?" ) ? static_cast<void> (0) : __assert_fail ("*M != S->first && \"Flipped pair in candidate list?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 2265, __PRETTY_FUNCTION__)); |
2266 | CandidatePairsSet.erase(ValuePair(S->second, *M)); |
2267 | } |
2268 | |
2269 | std::vector<Value *> &NN = CandidatePairs2[S->first]; |
2270 | for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); |
2271 | N != NE; ++N) { |
2272 | assert(*N != S->second && "Flipped pair in candidate list?")((*N != S->second && "Flipped pair in candidate list?" ) ? static_cast<void> (0) : __assert_fail ("*N != S->second && \"Flipped pair in candidate list?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 2272, __PRETTY_FUNCTION__)); |
2273 | CandidatePairsSet.erase(ValuePair(*N, S->first)); |
2274 | } |
2275 | } |
2276 | } |
2277 | |
2278 | DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"; } } while (0); |
2279 | } |
2280 | |
2281 | std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, |
2282 | unsigned n = 0) { |
2283 | if (!I->hasName()) |
2284 | return ""; |
2285 | |
2286 | return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + |
2287 | (n > 0 ? "." + utostr(n) : "")).str(); |
2288 | } |
2289 | |
2290 | // Returns the value that is to be used as the pointer input to the vector |
2291 | // instruction that fuses I with J. |
2292 | Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, |
2293 | Instruction *I, Instruction *J, unsigned o) { |
2294 | Value *IPtr, *JPtr; |
2295 | unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; |
2296 | int64_t OffsetInElmts; |
2297 | |
2298 | // Note: the analysis might fail here, that is why the pair order has |
2299 | // been precomputed (OffsetInElmts must be unused here). |
2300 | (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, |
2301 | IAddressSpace, JAddressSpace, |
2302 | OffsetInElmts, false); |
2303 | |
2304 | // The pointer value is taken to be the one with the lowest offset. |
2305 | Value *VPtr = IPtr; |
2306 | |
2307 | Type *ArgTypeI = IPtr->getType()->getPointerElementType(); |
2308 | Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); |
2309 | Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2310 | Type *VArgPtrType |
2311 | = PointerType::get(VArgType, |
2312 | IPtr->getType()->getPointerAddressSpace()); |
2313 | return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), |
2314 | /* insert before */ I); |
2315 | } |
2316 | |
2317 | void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, |
2318 | unsigned MaskOffset, unsigned NumInElem, |
2319 | unsigned NumInElem1, unsigned IdxOffset, |
2320 | std::vector<Constant*> &Mask) { |
2321 | unsigned NumElem1 = J->getType()->getVectorNumElements(); |
2322 | for (unsigned v = 0; v < NumElem1; ++v) { |
2323 | int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); |
2324 | if (m < 0) { |
2325 | Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); |
2326 | } else { |
2327 | unsigned mm = m + (int) IdxOffset; |
2328 | if (m >= (int) NumInElem1) |
2329 | mm += (int) NumInElem; |
2330 | |
2331 | Mask[v+MaskOffset] = |
2332 | ConstantInt::get(Type::getInt32Ty(Context), mm); |
2333 | } |
2334 | } |
2335 | } |
2336 | |
2337 | // Returns the value that is to be used as the vector-shuffle mask to the |
2338 | // vector instruction that fuses I with J. |
2339 | Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, |
2340 | Instruction *I, Instruction *J) { |
2341 | // This is the shuffle mask. We need to append the second |
2342 | // mask to the first, and the numbers need to be adjusted. |
2343 | |
2344 | Type *ArgTypeI = I->getType(); |
2345 | Type *ArgTypeJ = J->getType(); |
2346 | Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2347 | |
2348 | unsigned NumElemI = ArgTypeI->getVectorNumElements(); |
2349 | |
2350 | // Get the total number of elements in the fused vector type. |
2351 | // By definition, this must equal the number of elements in |
2352 | // the final mask. |
2353 | unsigned NumElem = VArgType->getVectorNumElements(); |
2354 | std::vector<Constant*> Mask(NumElem); |
2355 | |
2356 | Type *OpTypeI = I->getOperand(0)->getType(); |
2357 | unsigned NumInElemI = OpTypeI->getVectorNumElements(); |
2358 | Type *OpTypeJ = J->getOperand(0)->getType(); |
2359 | unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); |
2360 | |
2361 | // The fused vector will be: |
2362 | // ----------------------------------------------------- |
2363 | // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | |
2364 | // ----------------------------------------------------- |
2365 | // from which we'll extract NumElem total elements (where the first NumElemI |
2366 | // of them come from the mask in I and the remainder come from the mask |
2367 | // in J. |
2368 | |
2369 | // For the mask from the first pair... |
2370 | fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, |
2371 | 0, Mask); |
2372 | |
2373 | // For the mask from the second pair... |
2374 | fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, |
2375 | NumInElemI, Mask); |
2376 | |
2377 | return ConstantVector::get(Mask); |
2378 | } |
2379 | |
2380 | bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, |
2381 | Instruction *J, unsigned o, Value *&LOp, |
2382 | unsigned numElemL, |
2383 | Type *ArgTypeL, Type *ArgTypeH, |
2384 | bool IBeforeJ, unsigned IdxOff) { |
2385 | bool ExpandedIEChain = false; |
2386 | if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { |
2387 | // If we have a pure insertelement chain, then this can be rewritten |
2388 | // into a chain that directly builds the larger type. |
2389 | if (isPureIEChain(LIE)) { |
2390 | SmallVector<Value *, 8> VectElemts(numElemL, |
2391 | UndefValue::get(ArgTypeL->getScalarType())); |
2392 | InsertElementInst *LIENext = LIE; |
2393 | do { |
2394 | unsigned Idx = |
2395 | cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); |
2396 | VectElemts[Idx] = LIENext->getOperand(1); |
2397 | } while ((LIENext = |
2398 | dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); |
2399 | |
2400 | LIENext = nullptr; |
2401 | Value *LIEPrev = UndefValue::get(ArgTypeH); |
2402 | for (unsigned i = 0; i < numElemL; ++i) { |
2403 | if (isa<UndefValue>(VectElemts[i])) continue; |
2404 | LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], |
2405 | ConstantInt::get(Type::getInt32Ty(Context), |
2406 | i + IdxOff), |
2407 | getReplacementName(IBeforeJ ? I : J, |
2408 | true, o, i+1)); |
2409 | LIENext->insertBefore(IBeforeJ ? J : I); |
2410 | LIEPrev = LIENext; |
2411 | } |
2412 | |
2413 | LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); |
2414 | ExpandedIEChain = true; |
2415 | } |
2416 | } |
2417 | |
2418 | return ExpandedIEChain; |
2419 | } |
2420 | |
2421 | static unsigned getNumScalarElements(Type *Ty) { |
2422 | if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) |
2423 | return VecTy->getNumElements(); |
2424 | return 1; |
2425 | } |
2426 | |
2427 | // Returns the value to be used as the specified operand of the vector |
2428 | // instruction that fuses I with J. |
2429 | Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, |
2430 | Instruction *J, unsigned o, bool IBeforeJ) { |
2431 | Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); |
2432 | Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); |
2433 | |
2434 | // Compute the fused vector type for this operand |
2435 | Type *ArgTypeI = I->getOperand(o)->getType(); |
2436 | Type *ArgTypeJ = J->getOperand(o)->getType(); |
2437 | VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2438 | |
2439 | Instruction *L = I, *H = J; |
2440 | Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; |
2441 | |
2442 | unsigned numElemL = getNumScalarElements(ArgTypeL); |
2443 | unsigned numElemH = getNumScalarElements(ArgTypeH); |
2444 | |
2445 | Value *LOp = L->getOperand(o); |
2446 | Value *HOp = H->getOperand(o); |
2447 | unsigned numElem = VArgType->getNumElements(); |
2448 | |
2449 | // First, we check if we can reuse the "original" vector outputs (if these |
2450 | // exist). We might need a shuffle. |
2451 | ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); |
2452 | ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); |
2453 | ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); |
2454 | ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); |
2455 | |
2456 | // FIXME: If we're fusing shuffle instructions, then we can't apply this |
2457 | // optimization. The input vectors to the shuffle might be a different |
2458 | // length from the shuffle outputs. Unfortunately, the replacement |
2459 | // shuffle mask has already been formed, and the mask entries are sensitive |
2460 | // to the sizes of the inputs. |
2461 | bool IsSizeChangeShuffle = |
2462 | isa<ShuffleVectorInst>(L) && |
2463 | (LOp->getType() != L->getType() || HOp->getType() != H->getType()); |
2464 | |
2465 | if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { |
2466 | // We can have at most two unique vector inputs. |
2467 | bool CanUseInputs = true; |
2468 | Value *I1, *I2 = nullptr; |
2469 | if (LEE) { |
2470 | I1 = LEE->getOperand(0); |
2471 | } else { |
2472 | I1 = LSV->getOperand(0); |
2473 | I2 = LSV->getOperand(1); |
2474 | if (I2 == I1 || isa<UndefValue>(I2)) |
2475 | I2 = nullptr; |
2476 | } |
2477 | |
2478 | if (HEE) { |
2479 | Value *I3 = HEE->getOperand(0); |
2480 | if (!I2 && I3 != I1) |
2481 | I2 = I3; |
2482 | else if (I3 != I1 && I3 != I2) |
2483 | CanUseInputs = false; |
2484 | } else { |
2485 | Value *I3 = HSV->getOperand(0); |
2486 | if (!I2 && I3 != I1) |
2487 | I2 = I3; |
2488 | else if (I3 != I1 && I3 != I2) |
2489 | CanUseInputs = false; |
2490 | |
2491 | if (CanUseInputs) { |
2492 | Value *I4 = HSV->getOperand(1); |
2493 | if (!isa<UndefValue>(I4)) { |
2494 | if (!I2 && I4 != I1) |
2495 | I2 = I4; |
2496 | else if (I4 != I1 && I4 != I2) |
2497 | CanUseInputs = false; |
2498 | } |
2499 | } |
2500 | } |
2501 | |
2502 | if (CanUseInputs) { |
2503 | unsigned LOpElem = |
2504 | cast<Instruction>(LOp)->getOperand(0)->getType() |
2505 | ->getVectorNumElements(); |
2506 | |
2507 | unsigned HOpElem = |
2508 | cast<Instruction>(HOp)->getOperand(0)->getType() |
2509 | ->getVectorNumElements(); |
2510 | |
2511 | // We have one or two input vectors. We need to map each index of the |
2512 | // operands to the index of the original vector. |
2513 | SmallVector<std::pair<int, int>, 8> II(numElem); |
2514 | for (unsigned i = 0; i < numElemL; ++i) { |
2515 | int Idx, INum; |
2516 | if (LEE) { |
2517 | Idx = |
2518 | cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); |
2519 | INum = LEE->getOperand(0) == I1 ? 0 : 1; |
2520 | } else { |
2521 | Idx = LSV->getMaskValue(i); |
2522 | if (Idx < (int) LOpElem) { |
2523 | INum = LSV->getOperand(0) == I1 ? 0 : 1; |
2524 | } else { |
2525 | Idx -= LOpElem; |
2526 | INum = LSV->getOperand(1) == I1 ? 0 : 1; |
2527 | } |
2528 | } |
2529 | |
2530 | II[i] = std::pair<int, int>(Idx, INum); |
2531 | } |
2532 | for (unsigned i = 0; i < numElemH; ++i) { |
2533 | int Idx, INum; |
2534 | if (HEE) { |
2535 | Idx = |
2536 | cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); |
2537 | INum = HEE->getOperand(0) == I1 ? 0 : 1; |
2538 | } else { |
2539 | Idx = HSV->getMaskValue(i); |
2540 | if (Idx < (int) HOpElem) { |
2541 | INum = HSV->getOperand(0) == I1 ? 0 : 1; |
2542 | } else { |
2543 | Idx -= HOpElem; |
2544 | INum = HSV->getOperand(1) == I1 ? 0 : 1; |
2545 | } |
2546 | } |
2547 | |
2548 | II[i + numElemL] = std::pair<int, int>(Idx, INum); |
2549 | } |
2550 | |
2551 | // We now have an array which tells us from which index of which |
2552 | // input vector each element of the operand comes. |
2553 | VectorType *I1T = cast<VectorType>(I1->getType()); |
2554 | unsigned I1Elem = I1T->getNumElements(); |
2555 | |
2556 | if (!I2) { |
2557 | // In this case there is only one underlying vector input. Check for |
2558 | // the trivial case where we can use the input directly. |
2559 | if (I1Elem == numElem) { |
2560 | bool ElemInOrder = true; |
2561 | for (unsigned i = 0; i < numElem; ++i) { |
2562 | if (II[i].first != (int) i && II[i].first != -1) { |
2563 | ElemInOrder = false; |
2564 | break; |
2565 | } |
2566 | } |
2567 | |
2568 | if (ElemInOrder) |
2569 | return I1; |
2570 | } |
2571 | |
2572 | // A shuffle is needed. |
2573 | std::vector<Constant *> Mask(numElem); |
2574 | for (unsigned i = 0; i < numElem; ++i) { |
2575 | int Idx = II[i].first; |
2576 | if (Idx == -1) |
2577 | Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); |
2578 | else |
2579 | Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); |
2580 | } |
2581 | |
2582 | Instruction *S = |
2583 | new ShuffleVectorInst(I1, UndefValue::get(I1T), |
2584 | ConstantVector::get(Mask), |
2585 | getReplacementName(IBeforeJ ? I : J, |
2586 | true, o)); |
2587 | S->insertBefore(IBeforeJ ? J : I); |
2588 | return S; |
2589 | } |
2590 | |
2591 | VectorType *I2T = cast<VectorType>(I2->getType()); |
2592 | unsigned I2Elem = I2T->getNumElements(); |
2593 | |
2594 | // This input comes from two distinct vectors. The first step is to |
2595 | // make sure that both vectors are the same length. If not, the |
2596 | // smaller one will need to grow before they can be shuffled together. |
2597 | if (I1Elem < I2Elem) { |
2598 | std::vector<Constant *> Mask(I2Elem); |
2599 | unsigned v = 0; |
2600 | for (; v < I1Elem; ++v) |
2601 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2602 | for (; v < I2Elem; ++v) |
2603 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2604 | |
2605 | Instruction *NewI1 = |
2606 | new ShuffleVectorInst(I1, UndefValue::get(I1T), |
2607 | ConstantVector::get(Mask), |
2608 | getReplacementName(IBeforeJ ? I : J, |
2609 | true, o, 1)); |
2610 | NewI1->insertBefore(IBeforeJ ? J : I); |
2611 | I1 = NewI1; |
2612 | I1T = I2T; |
2613 | I1Elem = I2Elem; |
2614 | } else if (I1Elem > I2Elem) { |
2615 | std::vector<Constant *> Mask(I1Elem); |
2616 | unsigned v = 0; |
2617 | for (; v < I2Elem; ++v) |
2618 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2619 | for (; v < I1Elem; ++v) |
2620 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2621 | |
2622 | Instruction *NewI2 = |
2623 | new ShuffleVectorInst(I2, UndefValue::get(I2T), |
2624 | ConstantVector::get(Mask), |
2625 | getReplacementName(IBeforeJ ? I : J, |
2626 | true, o, 1)); |
2627 | NewI2->insertBefore(IBeforeJ ? J : I); |
2628 | I2 = NewI2; |
2629 | I2T = I1T; |
Value stored to 'I2T' is never read | |
2630 | I2Elem = I1Elem; |
2631 | } |
2632 | |
2633 | // Now that both I1 and I2 are the same length we can shuffle them |
2634 | // together (and use the result). |
2635 | std::vector<Constant *> Mask(numElem); |
2636 | for (unsigned v = 0; v < numElem; ++v) { |
2637 | if (II[v].first == -1) { |
2638 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2639 | } else { |
2640 | int Idx = II[v].first + II[v].second * I1Elem; |
2641 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); |
2642 | } |
2643 | } |
2644 | |
2645 | Instruction *NewOp = |
2646 | new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), |
2647 | getReplacementName(IBeforeJ ? I : J, true, o)); |
2648 | NewOp->insertBefore(IBeforeJ ? J : I); |
2649 | return NewOp; |
2650 | } |
2651 | } |
2652 | |
2653 | Type *ArgType = ArgTypeL; |
2654 | if (numElemL < numElemH) { |
2655 | if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, |
2656 | ArgTypeL, VArgType, IBeforeJ, 1)) { |
2657 | // This is another short-circuit case: we're combining a scalar into |
2658 | // a vector that is formed by an IE chain. We've just expanded the IE |
2659 | // chain, now insert the scalar and we're done. |
2660 | |
2661 | Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, |
2662 | getReplacementName(IBeforeJ ? I : J, true, o)); |
2663 | S->insertBefore(IBeforeJ ? J : I); |
2664 | return S; |
2665 | } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, |
2666 | ArgTypeH, IBeforeJ)) { |
2667 | // The two vector inputs to the shuffle must be the same length, |
2668 | // so extend the smaller vector to be the same length as the larger one. |
2669 | Instruction *NLOp; |
2670 | if (numElemL > 1) { |
2671 | |
2672 | std::vector<Constant *> Mask(numElemH); |
2673 | unsigned v = 0; |
2674 | for (; v < numElemL; ++v) |
2675 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2676 | for (; v < numElemH; ++v) |
2677 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2678 | |
2679 | NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), |
2680 | ConstantVector::get(Mask), |
2681 | getReplacementName(IBeforeJ ? I : J, |
2682 | true, o, 1)); |
2683 | } else { |
2684 | NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, |
2685 | getReplacementName(IBeforeJ ? I : J, |
2686 | true, o, 1)); |
2687 | } |
2688 | |
2689 | NLOp->insertBefore(IBeforeJ ? J : I); |
2690 | LOp = NLOp; |
2691 | } |
2692 | |
2693 | ArgType = ArgTypeH; |
2694 | } else if (numElemL > numElemH) { |
2695 | if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, |
2696 | ArgTypeH, VArgType, IBeforeJ)) { |
2697 | Instruction *S = |
2698 | InsertElementInst::Create(LOp, HOp, |
2699 | ConstantInt::get(Type::getInt32Ty(Context), |
2700 | numElemL), |
2701 | getReplacementName(IBeforeJ ? I : J, |
2702 | true, o)); |
2703 | S->insertBefore(IBeforeJ ? J : I); |
2704 | return S; |
2705 | } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, |
2706 | ArgTypeL, IBeforeJ)) { |
2707 | Instruction *NHOp; |
2708 | if (numElemH > 1) { |
2709 | std::vector<Constant *> Mask(numElemL); |
2710 | unsigned v = 0; |
2711 | for (; v < numElemH; ++v) |
2712 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2713 | for (; v < numElemL; ++v) |
2714 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2715 | |
2716 | NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), |
2717 | ConstantVector::get(Mask), |
2718 | getReplacementName(IBeforeJ ? I : J, |
2719 | true, o, 1)); |
2720 | } else { |
2721 | NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, |
2722 | getReplacementName(IBeforeJ ? I : J, |
2723 | true, o, 1)); |
2724 | } |
2725 | |
2726 | NHOp->insertBefore(IBeforeJ ? J : I); |
2727 | HOp = NHOp; |
2728 | } |
2729 | } |
2730 | |
2731 | if (ArgType->isVectorTy()) { |
2732 | unsigned numElem = VArgType->getVectorNumElements(); |
2733 | std::vector<Constant*> Mask(numElem); |
2734 | for (unsigned v = 0; v < numElem; ++v) { |
2735 | unsigned Idx = v; |
2736 | // If the low vector was expanded, we need to skip the extra |
2737 | // undefined entries. |
2738 | if (v >= numElemL && numElemH > numElemL) |
2739 | Idx += (numElemH - numElemL); |
2740 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); |
2741 | } |
2742 | |
2743 | Instruction *BV = new ShuffleVectorInst(LOp, HOp, |
2744 | ConstantVector::get(Mask), |
2745 | getReplacementName(IBeforeJ ? I : J, true, o)); |
2746 | BV->insertBefore(IBeforeJ ? J : I); |
2747 | return BV; |
2748 | } |
2749 | |
2750 | Instruction *BV1 = InsertElementInst::Create( |
2751 | UndefValue::get(VArgType), LOp, CV0, |
2752 | getReplacementName(IBeforeJ ? I : J, |
2753 | true, o, 1)); |
2754 | BV1->insertBefore(IBeforeJ ? J : I); |
2755 | Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, |
2756 | getReplacementName(IBeforeJ ? I : J, |
2757 | true, o, 2)); |
2758 | BV2->insertBefore(IBeforeJ ? J : I); |
2759 | return BV2; |
2760 | } |
2761 | |
2762 | // This function creates an array of values that will be used as the inputs |
2763 | // to the vector instruction that fuses I with J. |
2764 | void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, |
2765 | Instruction *I, Instruction *J, |
2766 | SmallVectorImpl<Value *> &ReplacedOperands, |
2767 | bool IBeforeJ) { |
2768 | unsigned NumOperands = I->getNumOperands(); |
2769 | |
2770 | for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { |
2771 | // Iterate backward so that we look at the store pointer |
2772 | // first and know whether or not we need to flip the inputs. |
2773 | |
2774 | if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { |
2775 | // This is the pointer for a load/store instruction. |
2776 | ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); |
2777 | continue; |
2778 | } else if (isa<CallInst>(I)) { |
2779 | Function *F = cast<CallInst>(I)->getCalledFunction(); |
2780 | Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); |
2781 | if (o == NumOperands-1) { |
2782 | BasicBlock &BB = *I->getParent(); |
2783 | |
2784 | Module *M = BB.getParent()->getParent(); |
2785 | Type *ArgTypeI = I->getType(); |
2786 | Type *ArgTypeJ = J->getType(); |
2787 | Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2788 | |
2789 | ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); |
2790 | continue; |
2791 | } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || |
2792 | IID == Intrinsic::cttz) && o == 1) { |
2793 | // The second argument of powi/ctlz/cttz is a single integer/constant |
2794 | // and we've already checked that both arguments are equal. |
2795 | // As a result, we just keep I's second argument. |
2796 | ReplacedOperands[o] = I->getOperand(o); |
2797 | continue; |
2798 | } |
2799 | } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { |
2800 | ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); |
2801 | continue; |
2802 | } |
2803 | |
2804 | ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); |
2805 | } |
2806 | } |
2807 | |
2808 | // This function creates two values that represent the outputs of the |
2809 | // original I and J instructions. These are generally vector shuffles |
2810 | // or extracts. In many cases, these will end up being unused and, thus, |
2811 | // eliminated by later passes. |
2812 | void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, |
2813 | Instruction *J, Instruction *K, |
2814 | Instruction *&InsertionPt, |
2815 | Instruction *&K1, Instruction *&K2) { |
2816 | if (isa<StoreInst>(I)) { |
2817 | AA->replaceWithNewValue(I, K); |
2818 | AA->replaceWithNewValue(J, K); |
2819 | } else { |
2820 | Type *IType = I->getType(); |
2821 | Type *JType = J->getType(); |
2822 | |
2823 | VectorType *VType = getVecTypeForPair(IType, JType); |
2824 | unsigned numElem = VType->getNumElements(); |
2825 | |
2826 | unsigned numElemI = getNumScalarElements(IType); |
2827 | unsigned numElemJ = getNumScalarElements(JType); |
2828 | |
2829 | if (IType->isVectorTy()) { |
2830 | std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); |
2831 | for (unsigned v = 0; v < numElemI; ++v) { |
2832 | Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2833 | Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); |
2834 | } |
2835 | |
2836 | K1 = new ShuffleVectorInst(K, UndefValue::get(VType), |
2837 | ConstantVector::get( Mask1), |
2838 | getReplacementName(K, false, 1)); |
2839 | } else { |
2840 | Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); |
2841 | K1 = ExtractElementInst::Create(K, CV0, |
2842 | getReplacementName(K, false, 1)); |
2843 | } |
2844 | |
2845 | if (JType->isVectorTy()) { |
2846 | std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); |
2847 | for (unsigned v = 0; v < numElemJ; ++v) { |
2848 | Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2849 | Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); |
2850 | } |
2851 | |
2852 | K2 = new ShuffleVectorInst(K, UndefValue::get(VType), |
2853 | ConstantVector::get( Mask2), |
2854 | getReplacementName(K, false, 2)); |
2855 | } else { |
2856 | Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); |
2857 | K2 = ExtractElementInst::Create(K, CV1, |
2858 | getReplacementName(K, false, 2)); |
2859 | } |
2860 | |
2861 | K1->insertAfter(K); |
2862 | K2->insertAfter(K1); |
2863 | InsertionPt = K2; |
2864 | } |
2865 | } |
2866 | |
2867 | // Move all uses of the function I (including pairing-induced uses) after J. |
2868 | bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, |
2869 | DenseSet<ValuePair> &LoadMoveSetPairs, |
2870 | Instruction *I, Instruction *J) { |
2871 | // Skip to the first instruction past I. |
2872 | BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); |
2873 | |
2874 | DenseSet<Value *> Users; |
2875 | AliasSetTracker WriteSet(*AA); |
2876 | if (I->mayWriteToMemory()) WriteSet.add(I); |
2877 | |
2878 | for (; cast<Instruction>(L) != J; ++L) |
2879 | (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); |
2880 | |
2881 | assert(cast<Instruction>(L) == J &&((cast<Instruction>(L) == J && "Tracking has not proceeded far enough to check for dependencies" ) ? static_cast<void> (0) : __assert_fail ("cast<Instruction>(L) == J && \"Tracking has not proceeded far enough to check for dependencies\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 2882, __PRETTY_FUNCTION__)) |
2882 | "Tracking has not proceeded far enough to check for dependencies")((cast<Instruction>(L) == J && "Tracking has not proceeded far enough to check for dependencies" ) ? static_cast<void> (0) : __assert_fail ("cast<Instruction>(L) == J && \"Tracking has not proceeded far enough to check for dependencies\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 2882, __PRETTY_FUNCTION__)); |
2883 | // If J is now in the use set of I, then trackUsesOfI will return true |
2884 | // and we have a dependency cycle (and the fusing operation must abort). |
2885 | return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); |
2886 | } |
2887 | |
2888 | // Move all uses of the function I (including pairing-induced uses) after J. |
2889 | void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, |
2890 | DenseSet<ValuePair> &LoadMoveSetPairs, |
2891 | Instruction *&InsertionPt, |
2892 | Instruction *I, Instruction *J) { |
2893 | // Skip to the first instruction past I. |
2894 | BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); |
2895 | |
2896 | DenseSet<Value *> Users; |
2897 | AliasSetTracker WriteSet(*AA); |
2898 | if (I->mayWriteToMemory()) WriteSet.add(I); |
2899 | |
2900 | for (; cast<Instruction>(L) != J;) { |
2901 | if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { |
2902 | // Move this instruction |
2903 | Instruction *InstToMove = L; ++L; |
2904 | |
2905 | DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: moving: " << * InstToMove << " to after " << *InsertionPt << "\n"; } } while (0) |
2906 | " to after " << *InsertionPt << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: moving: " << * InstToMove << " to after " << *InsertionPt << "\n"; } } while (0); |
2907 | InstToMove->removeFromParent(); |
2908 | InstToMove->insertAfter(InsertionPt); |
2909 | InsertionPt = InstToMove; |
2910 | } else { |
2911 | ++L; |
2912 | } |
2913 | } |
2914 | } |
2915 | |
2916 | // Collect all load instruction that are in the move set of a given first |
2917 | // pair member. These loads depend on the first instruction, I, and so need |
2918 | // to be moved after J (the second instruction) when the pair is fused. |
2919 | void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, |
2920 | DenseMap<Value *, Value *> &ChosenPairs, |
2921 | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
2922 | DenseSet<ValuePair> &LoadMoveSetPairs, |
2923 | Instruction *I) { |
2924 | // Skip to the first instruction past I. |
2925 | BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); |
2926 | |
2927 | DenseSet<Value *> Users; |
2928 | AliasSetTracker WriteSet(*AA); |
2929 | if (I->mayWriteToMemory()) WriteSet.add(I); |
2930 | |
2931 | // Note: We cannot end the loop when we reach J because J could be moved |
2932 | // farther down the use chain by another instruction pairing. Also, J |
2933 | // could be before I if this is an inverted input. |
2934 | for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { |
2935 | if (trackUsesOfI(Users, WriteSet, I, L)) { |
2936 | if (L->mayReadFromMemory()) { |
2937 | LoadMoveSet[L].push_back(I); |
2938 | LoadMoveSetPairs.insert(ValuePair(L, I)); |
2939 | } |
2940 | } |
2941 | } |
2942 | } |
2943 | |
2944 | // In cases where both load/stores and the computation of their pointers |
2945 | // are chosen for vectorization, we can end up in a situation where the |
2946 | // aliasing analysis starts returning different query results as the |
2947 | // process of fusing instruction pairs continues. Because the algorithm |
2948 | // relies on finding the same use dags here as were found earlier, we'll |
2949 | // need to precompute the necessary aliasing information here and then |
2950 | // manually update it during the fusion process. |
2951 | void BBVectorize::collectLoadMoveSet(BasicBlock &BB, |
2952 | std::vector<Value *> &PairableInsts, |
2953 | DenseMap<Value *, Value *> &ChosenPairs, |
2954 | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
2955 | DenseSet<ValuePair> &LoadMoveSetPairs) { |
2956 | for (std::vector<Value *>::iterator PI = PairableInsts.begin(), |
2957 | PIE = PairableInsts.end(); PI != PIE; ++PI) { |
2958 | DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); |
2959 | if (P == ChosenPairs.end()) continue; |
2960 | |
2961 | Instruction *I = cast<Instruction>(P->first); |
2962 | collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, |
2963 | LoadMoveSetPairs, I); |
2964 | } |
2965 | } |
2966 | |
2967 | // This function fuses the chosen instruction pairs into vector instructions, |
2968 | // taking care preserve any needed scalar outputs and, then, it reorders the |
2969 | // remaining instructions as needed (users of the first member of the pair |
2970 | // need to be moved to after the location of the second member of the pair |
2971 | // because the vector instruction is inserted in the location of the pair's |
2972 | // second member). |
2973 | void BBVectorize::fuseChosenPairs(BasicBlock &BB, |
2974 | std::vector<Value *> &PairableInsts, |
2975 | DenseMap<Value *, Value *> &ChosenPairs, |
2976 | DenseSet<ValuePair> &FixedOrderPairs, |
2977 | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
2978 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
2979 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { |
2980 | LLVMContext& Context = BB.getContext(); |
2981 | |
2982 | // During the vectorization process, the order of the pairs to be fused |
2983 | // could be flipped. So we'll add each pair, flipped, into the ChosenPairs |
2984 | // list. After a pair is fused, the flipped pair is removed from the list. |
2985 | DenseSet<ValuePair> FlippedPairs; |
2986 | for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), |
2987 | E = ChosenPairs.end(); P != E; ++P) |
2988 | FlippedPairs.insert(ValuePair(P->second, P->first)); |
2989 | for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), |
2990 | E = FlippedPairs.end(); P != E; ++P) |
2991 | ChosenPairs.insert(*P); |
2992 | |
2993 | DenseMap<Value *, std::vector<Value *> > LoadMoveSet; |
2994 | DenseSet<ValuePair> LoadMoveSetPairs; |
2995 | collectLoadMoveSet(BB, PairableInsts, ChosenPairs, |
2996 | LoadMoveSet, LoadMoveSetPairs); |
2997 | |
2998 | DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: initial: \n" << BB << "\n"; } } while (0); |
2999 | |
3000 | for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { |
3001 | DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); |
3002 | if (P == ChosenPairs.end()) { |
3003 | ++PI; |
3004 | continue; |
3005 | } |
3006 | |
3007 | if (getDepthFactor(P->first) == 0) { |
3008 | // These instructions are not really fused, but are tracked as though |
3009 | // they are. Any case in which it would be interesting to fuse them |
3010 | // will be taken care of by InstCombine. |
3011 | --NumFusedOps; |
3012 | ++PI; |
3013 | continue; |
3014 | } |
3015 | |
3016 | Instruction *I = cast<Instruction>(P->first), |
3017 | *J = cast<Instruction>(P->second); |
3018 | |
3019 | DEBUG(dbgs() << "BBV: fusing: " << *I <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing: " << * I << " <-> " << *J << "\n"; } } while (0) |
3020 | " <-> " << *J << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusing: " << * I << " <-> " << *J << "\n"; } } while (0); |
3021 | |
3022 | // Remove the pair and flipped pair from the list. |
3023 | DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); |
3024 | assert(FP != ChosenPairs.end() && "Flipped pair not found in list")((FP != ChosenPairs.end() && "Flipped pair not found in list" ) ? static_cast<void> (0) : __assert_fail ("FP != ChosenPairs.end() && \"Flipped pair not found in list\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3024, __PRETTY_FUNCTION__)); |
3025 | ChosenPairs.erase(FP); |
3026 | ChosenPairs.erase(P); |
3027 | |
3028 | if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { |
3029 | DEBUG(dbgs() << "BBV: fusion of: " << *I <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusion of: " << *I << " <-> " << *J << " aborted because of non-trivial dependency cycle\n" ; } } while (0) |
3030 | " <-> " << *J <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusion of: " << *I << " <-> " << *J << " aborted because of non-trivial dependency cycle\n" ; } } while (0) |
3031 | " aborted because of non-trivial dependency cycle\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: fusion of: " << *I << " <-> " << *J << " aborted because of non-trivial dependency cycle\n" ; } } while (0); |
3032 | --NumFusedOps; |
3033 | ++PI; |
3034 | continue; |
3035 | } |
3036 | |
3037 | // If the pair must have the other order, then flip it. |
3038 | bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); |
3039 | if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { |
3040 | // This pair does not have a fixed order, and so we might want to |
3041 | // flip it if that will yield fewer shuffles. We count the number |
3042 | // of dependencies connected via swaps, and those directly connected, |
3043 | // and flip the order if the number of swaps is greater. |
3044 | bool OrigOrder = true; |
3045 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = |
3046 | ConnectedPairDeps.find(ValuePair(I, J)); |
3047 | if (IJ == ConnectedPairDeps.end()) { |
3048 | IJ = ConnectedPairDeps.find(ValuePair(J, I)); |
3049 | OrigOrder = false; |
3050 | } |
3051 | |
3052 | if (IJ != ConnectedPairDeps.end()) { |
3053 | unsigned NumDepsDirect = 0, NumDepsSwap = 0; |
3054 | for (std::vector<ValuePair>::iterator T = IJ->second.begin(), |
3055 | TE = IJ->second.end(); T != TE; ++T) { |
3056 | VPPair Q(IJ->first, *T); |
3057 | DenseMap<VPPair, unsigned>::iterator R = |
3058 | PairConnectionTypes.find(VPPair(Q.second, Q.first)); |
3059 | assert(R != PairConnectionTypes.end() &&((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3060, __PRETTY_FUNCTION__)) |
3060 | "Cannot find pair connection type")((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3060, __PRETTY_FUNCTION__)); |
3061 | if (R->second == PairConnectionDirect) |
3062 | ++NumDepsDirect; |
3063 | else if (R->second == PairConnectionSwap) |
3064 | ++NumDepsSwap; |
3065 | } |
3066 | |
3067 | if (!OrigOrder) |
3068 | std::swap(NumDepsDirect, NumDepsSwap); |
3069 | |
3070 | if (NumDepsSwap > NumDepsDirect) { |
3071 | FlipPairOrder = true; |
3072 | DEBUG(dbgs() << "BBV: reordering pair: " << *I <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: reordering pair: " << *I << " <-> " << *J << "\n"; } } while (0) |
3073 | " <-> " << *J << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: reordering pair: " << *I << " <-> " << *J << "\n"; } } while (0); |
3074 | } |
3075 | } |
3076 | } |
3077 | |
3078 | Instruction *L = I, *H = J; |
3079 | if (FlipPairOrder) |
3080 | std::swap(H, L); |
3081 | |
3082 | // If the pair being fused uses the opposite order from that in the pair |
3083 | // connection map, then we need to flip the types. |
3084 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = |
3085 | ConnectedPairs.find(ValuePair(H, L)); |
3086 | if (HL != ConnectedPairs.end()) |
3087 | for (std::vector<ValuePair>::iterator T = HL->second.begin(), |
3088 | TE = HL->second.end(); T != TE; ++T) { |
3089 | VPPair Q(HL->first, *T); |
3090 | DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); |
3091 | assert(R != PairConnectionTypes.end() &&((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3092, __PRETTY_FUNCTION__)) |
3092 | "Cannot find pair connection type")((R != PairConnectionTypes.end() && "Cannot find pair connection type" ) ? static_cast<void> (0) : __assert_fail ("R != PairConnectionTypes.end() && \"Cannot find pair connection type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3092, __PRETTY_FUNCTION__)); |
3093 | if (R->second == PairConnectionDirect) |
3094 | R->second = PairConnectionSwap; |
3095 | else if (R->second == PairConnectionSwap) |
3096 | R->second = PairConnectionDirect; |
3097 | } |
3098 | |
3099 | bool LBeforeH = !FlipPairOrder; |
3100 | unsigned NumOperands = I->getNumOperands(); |
3101 | SmallVector<Value *, 3> ReplacedOperands(NumOperands); |
3102 | getReplacementInputsForPair(Context, L, H, ReplacedOperands, |
3103 | LBeforeH); |
3104 | |
3105 | // Make a copy of the original operation, change its type to the vector |
3106 | // type and replace its operands with the vector operands. |
3107 | Instruction *K = L->clone(); |
3108 | if (L->hasName()) |
3109 | K->takeName(L); |
3110 | else if (H->hasName()) |
3111 | K->takeName(H); |
3112 | |
3113 | if (!isa<StoreInst>(K)) |
3114 | K->mutateType(getVecTypeForPair(L->getType(), H->getType())); |
3115 | |
3116 | unsigned KnownIDs[] = { |
3117 | LLVMContext::MD_tbaa, |
3118 | LLVMContext::MD_alias_scope, |
3119 | LLVMContext::MD_noalias, |
3120 | LLVMContext::MD_fpmath |
3121 | }; |
3122 | combineMetadata(K, H, KnownIDs); |
3123 | K->intersectOptionalDataWith(H); |
3124 | |
3125 | for (unsigned o = 0; o < NumOperands; ++o) |
3126 | K->setOperand(o, ReplacedOperands[o]); |
3127 | |
3128 | K->insertAfter(J); |
3129 | |
3130 | // Instruction insertion point: |
3131 | Instruction *InsertionPt = K; |
3132 | Instruction *K1 = nullptr, *K2 = nullptr; |
3133 | replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); |
3134 | |
3135 | // The use dag of the first original instruction must be moved to after |
3136 | // the location of the second instruction. The entire use dag of the |
3137 | // first instruction is disjoint from the input dag of the second |
3138 | // (by definition), and so commutes with it. |
3139 | |
3140 | moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); |
3141 | |
3142 | if (!isa<StoreInst>(I)) { |
3143 | L->replaceAllUsesWith(K1); |
3144 | H->replaceAllUsesWith(K2); |
3145 | AA->replaceWithNewValue(L, K1); |
3146 | AA->replaceWithNewValue(H, K2); |
3147 | } |
3148 | |
3149 | // Instructions that may read from memory may be in the load move set. |
3150 | // Once an instruction is fused, we no longer need its move set, and so |
3151 | // the values of the map never need to be updated. However, when a load |
3152 | // is fused, we need to merge the entries from both instructions in the |
3153 | // pair in case those instructions were in the move set of some other |
3154 | // yet-to-be-fused pair. The loads in question are the keys of the map. |
3155 | if (I->mayReadFromMemory()) { |
3156 | std::vector<ValuePair> NewSetMembers; |
3157 | DenseMap<Value *, std::vector<Value *> >::iterator II = |
3158 | LoadMoveSet.find(I); |
3159 | if (II != LoadMoveSet.end()) |
3160 | for (std::vector<Value *>::iterator N = II->second.begin(), |
3161 | NE = II->second.end(); N != NE; ++N) |
3162 | NewSetMembers.push_back(ValuePair(K, *N)); |
3163 | DenseMap<Value *, std::vector<Value *> >::iterator JJ = |
3164 | LoadMoveSet.find(J); |
3165 | if (JJ != LoadMoveSet.end()) |
3166 | for (std::vector<Value *>::iterator N = JJ->second.begin(), |
3167 | NE = JJ->second.end(); N != NE; ++N) |
3168 | NewSetMembers.push_back(ValuePair(K, *N)); |
3169 | for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), |
3170 | AE = NewSetMembers.end(); A != AE; ++A) { |
3171 | LoadMoveSet[A->first].push_back(A->second); |
3172 | LoadMoveSetPairs.insert(*A); |
3173 | } |
3174 | } |
3175 | |
3176 | // Before removing I, set the iterator to the next instruction. |
3177 | PI = std::next(BasicBlock::iterator(I)); |
3178 | if (cast<Instruction>(PI) == J) |
3179 | ++PI; |
3180 | |
3181 | SE->forgetValue(I); |
3182 | SE->forgetValue(J); |
3183 | I->eraseFromParent(); |
3184 | J->eraseFromParent(); |
3185 | |
3186 | DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << BB << "\n"; } } while (0) |
3187 | BB << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << BB << "\n"; } } while (0); |
3188 | } |
3189 | |
3190 | DEBUG(dbgs() << "BBV: final: \n" << BB << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("bb-vectorize")) { dbgs() << "BBV: final: \n" << BB << "\n"; } } while (0); |
3191 | } |
3192 | } |
3193 | |
3194 | char BBVectorize::ID = 0; |
3195 | static const char bb_vectorize_name[] = "Basic-Block Vectorization"; |
3196 | INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)static void* initializeBBVectorizePassOnce(PassRegistry & Registry) { |
3197 | INITIALIZE_AG_DEPENDENCY(AliasAnalysis)initializeAliasAnalysisAnalysisGroup(Registry); |
3198 | INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)initializeTargetTransformInfoAnalysisGroup(Registry); |
3199 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry); |
3200 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)initializeScalarEvolutionPass(Registry); |
3201 | INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)PassInfo *PI = new PassInfo(bb_vectorize_name, "bb-vectorize" , & BBVectorize ::ID, PassInfo::NormalCtor_t(callDefaultCtor < BBVectorize >), false, false); Registry.registerPass( *PI, true); return PI; } void llvm::initializeBBVectorizePass (PassRegistry &Registry) { static volatile sys::cas_flag initialized = 0; sys::cas_flag old_val = sys::CompareAndSwap(&initialized , 1, 0); if (old_val == 0) { initializeBBVectorizePassOnce(Registry ); sys::MemoryFence(); AnnotateIgnoreWritesBegin("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3201); AnnotateHappensBefore("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3201, &initialized); initialized = 2; AnnotateIgnoreWritesEnd ("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3201); } else { sys::cas_flag tmp = initialized; sys::MemoryFence (); while (tmp != 2) { tmp = initialized; sys::MemoryFence(); } } AnnotateHappensAfter("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/BBVectorize.cpp" , 3201, &initialized); } |
3202 | |
3203 | BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { |
3204 | return new BBVectorize(C); |
3205 | } |
3206 | |
3207 | bool |
3208 | llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { |
3209 | BBVectorize BBVectorizer(P, C); |
3210 | return BBVectorizer.vectorizeBB(BB); |
3211 | } |
3212 | |
3213 | //===----------------------------------------------------------------------===// |
3214 | VectorizeConfig::VectorizeConfig() { |
3215 | VectorBits = ::VectorBits; |
3216 | VectorizeBools = !::NoBools; |
3217 | VectorizeInts = !::NoInts; |
3218 | VectorizeFloats = !::NoFloats; |
3219 | VectorizePointers = !::NoPointers; |
3220 | VectorizeCasts = !::NoCasts; |
3221 | VectorizeMath = !::NoMath; |
3222 | VectorizeBitManipulations = !::NoBitManipulation; |
3223 | VectorizeFMA = !::NoFMA; |
3224 | VectorizeSelect = !::NoSelect; |
3225 | VectorizeCmp = !::NoCmp; |
3226 | VectorizeGEP = !::NoGEP; |
3227 | VectorizeMemOps = !::NoMemOps; |
3228 | AlignedOnly = ::AlignedOnly; |
3229 | ReqChainDepth= ::ReqChainDepth; |
3230 | SearchLimit = ::SearchLimit; |
3231 | MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; |
3232 | SplatBreaksChain = ::SplatBreaksChain; |
3233 | MaxInsts = ::MaxInsts; |
3234 | MaxPairs = ::MaxPairs; |
3235 | MaxIter = ::MaxIter; |
3236 | Pow2LenOnly = ::Pow2LenOnly; |
3237 | NoMemOpBoost = ::NoMemOpBoost; |
3238 | FastDep = ::FastDep; |
3239 | } |