Bug Summary

File: lib/Transforms/Vectorize/SLPVectorizer.cpp
Warning: line 3370, column 22
Called C++ object pointer is null
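
The warning means the analyzer found a path on which a member function is called through an object pointer it has proven to be null. As a rough, hypothetical illustration of this defect class only (not the code at line 3370), the sketch below uses real LLVM APIs (dyn_cast<>, LoadInst::getAlignment()) inside invented helper functions:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Hypothetical helper, for illustration only.
unsigned alignmentOf(Value *V) {
  auto *LI = dyn_cast<LoadInst>(V); // null when V is not a LoadInst
  return LI->getAlignment();        // analyzer: "Called C++ object pointer is null"
}

// Guarded version that the analyzer accepts.
unsigned alignmentOfSafe(Value *V) {
  if (auto *LI = dyn_cast<LoadInst>(V))
    return LI->getAlignment();
  return 0;
}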

Annotated Source Code

1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
11// stores that can be put together into vector-stores. Next, it attempts to
12// construct vectorizable tree using the use-def chains. If a profitable tree
13// was found, the SLP vectorizer performs vectorization on the tree.
14//
15// The pass is inspired by the work described in the paper:
16// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
17//
18//===----------------------------------------------------------------------===//
19
20#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/DenseMap.h"
23#include "llvm/ADT/DenseSet.h"
24#include "llvm/ADT/MapVector.h"
25#include "llvm/ADT/None.h"
26#include "llvm/ADT/Optional.h"
27#include "llvm/ADT/PostOrderIterator.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/SmallPtrSet.h"
31#include "llvm/ADT/SmallSet.h"
32#include "llvm/ADT/SmallVector.h"
33#include "llvm/ADT/Statistic.h"
34#include "llvm/ADT/iterator.h"
35#include "llvm/ADT/iterator_range.h"
36#include "llvm/Analysis/AliasAnalysis.h"
37#include "llvm/Analysis/CodeMetrics.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/LoopAccessAnalysis.h"
41#include "llvm/Analysis/LoopInfo.h"
42#include "llvm/Analysis/MemoryLocation.h"
43#include "llvm/Analysis/OptimizationRemarkEmitter.h"
44#include "llvm/Analysis/ScalarEvolution.h"
45#include "llvm/Analysis/ScalarEvolutionExpressions.h"
46#include "llvm/Analysis/TargetLibraryInfo.h"
47#include "llvm/Analysis/TargetTransformInfo.h"
48#include "llvm/Analysis/ValueTracking.h"
49#include "llvm/Analysis/VectorUtils.h"
50#include "llvm/IR/Attributes.h"
51#include "llvm/IR/BasicBlock.h"
52#include "llvm/IR/Constant.h"
53#include "llvm/IR/Constants.h"
54#include "llvm/IR/DataLayout.h"
55#include "llvm/IR/DebugLoc.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
62#include "llvm/IR/Instructions.h"
63#include "llvm/IR/IntrinsicInst.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/NoFolder.h"
67#include "llvm/IR/Operator.h"
68#include "llvm/IR/PassManager.h"
69#include "llvm/IR/PatternMatch.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
75#include "llvm/IR/Verifier.h"
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/DOTGraphTraits.h"
81#include "llvm/Support/Debug.h"
82#include "llvm/Support/ErrorHandling.h"
83#include "llvm/Support/GraphWriter.h"
84#include "llvm/Support/KnownBits.h"
85#include "llvm/Support/MathExtras.h"
86#include "llvm/Support/raw_ostream.h"
87#include "llvm/Transforms/Utils/LoopUtils.h"
88#include "llvm/Transforms/Vectorize.h"
89#include <algorithm>
90#include <cassert>
91#include <cstdint>
92#include <iterator>
93#include <memory>
94#include <set>
95#include <string>
96#include <tuple>
97#include <utility>
98#include <vector>
99
100using namespace llvm;
101using namespace llvm::PatternMatch;
102using namespace slpvectorizer;
103
104#define SV_NAME "slp-vectorizer"
105#define DEBUG_TYPE "SLP"
106
107STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
108
109static cl::opt<int>
110 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
111 cl::desc("Only vectorize if you gain more than this "
112 "number "));
113
114static cl::opt<bool>
115ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
116 cl::desc("Attempt to vectorize horizontal reductions"));
117
118static cl::opt<bool> ShouldStartVectorizeHorAtStore(
119 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
120 cl::desc(
121 "Attempt to vectorize horizontal reductions feeding into a store"));
122
123static cl::opt<int>
124MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
125 cl::desc("Attempt to vectorize for this register size in bits"));
126
127/// Limits the size of scheduling regions in a block.
128/// It avoids long compile times for _very_ large blocks where vector
129/// instructions are spread over a wide range.
130/// This limit is way higher than needed by real-world functions.
131static cl::opt<int>
132ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
133 cl::desc("Limit the size of the SLP scheduling region per block"));
134
135static cl::opt<int> MinVectorRegSizeOption(
136 "slp-min-reg-size", cl::init(128), cl::Hidden,
137 cl::desc("Attempt to vectorize for this register size in bits"));
138
139static cl::opt<unsigned> RecursionMaxDepth(
140 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
141 cl::desc("Limit the recursion depth when building a vectorizable tree"));
142
143static cl::opt<unsigned> MinTreeSize(
144 "slp-min-tree-size", cl::init(3), cl::Hidden,
145 cl::desc("Only vectorize small trees if they are fully vectorizable"));
146
147static cl::opt<bool>
148 ViewSLPTree("view-slp-tree", cl::Hidden,
149 cl::desc("Display the SLP trees with Graphviz"));
150
151// Limit the number of alias checks. The limit is chosen so that
152// it has no negative effect on the llvm benchmarks.
153static const unsigned AliasedCheckLimit = 10;
154
155// Another limit for the alias checks: The maximum distance between load/store
156// instructions where alias checks are done.
157// This limit is useful for very large basic blocks.
158static const unsigned MaxMemDepDistance = 160;
159
160/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
161/// regions to be handled.
162static const int MinScheduleRegionSize = 16;
163
164/// \brief Predicate for the element types that the SLP vectorizer supports.
165///
166/// The most important thing to filter here are types which are invalid in LLVM
167/// vectors. We also filter target specific types which have absolutely no
168/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
169/// avoids spending time checking the cost model and realizing that they will
170/// be inevitably scalarized.
171static bool isValidElementType(Type *Ty) {
172 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
173 !Ty->isPPC_FP128Ty();
174}
175
176/// \returns true if all of the instructions in \p VL are in the same block or
177/// false otherwise.
178static bool allSameBlock(ArrayRef<Value *> VL) {
179 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
180 if (!I0)
181 return false;
182 BasicBlock *BB = I0->getParent();
183 for (int i = 1, e = VL.size(); i < e; i++) {
184 Instruction *I = dyn_cast<Instruction>(VL[i]);
185 if (!I)
186 return false;
187
188 if (BB != I->getParent())
189 return false;
190 }
191 return true;
192}
193
194/// \returns True if all of the values in \p VL are constants.
195static bool allConstant(ArrayRef<Value *> VL) {
196 for (Value *i : VL)
197 if (!isa<Constant>(i))
198 return false;
199 return true;
200}
201
202/// \returns True if all of the values in \p VL are identical.
203static bool isSplat(ArrayRef<Value *> VL) {
204 for (unsigned i = 1, e = VL.size(); i < e; ++i)
205 if (VL[i] != VL[0])
206 return false;
207 return true;
208}
209
210/// Checks if the vector of instructions can be represented as a shuffle, like:
211/// %x0 = extractelement <4 x i8> %x, i32 0
212/// %x3 = extractelement <4 x i8> %x, i32 3
213/// %y1 = extractelement <4 x i8> %y, i32 1
214/// %y2 = extractelement <4 x i8> %y, i32 2
215/// %x0x0 = mul i8 %x0, %x0
216/// %x3x3 = mul i8 %x3, %x3
217/// %y1y1 = mul i8 %y1, %y1
218/// %y2y2 = mul i8 %y2, %y2
219/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
220/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
221/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
222/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
223/// ret <4 x i8> %ins4
224/// can be transformed into:
225/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
226/// i32 6>
227/// %2 = mul <4 x i8> %1, %1
228/// ret <4 x i8> %2
229/// We convert this initially to something like:
230/// %x0 = extractelement <4 x i8> %x, i32 0
231/// %x3 = extractelement <4 x i8> %x, i32 3
232/// %y1 = extractelement <4 x i8> %y, i32 1
233/// %y2 = extractelement <4 x i8> %y, i32 2
234/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
235/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
236/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
237/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
238/// %5 = mul <4 x i8> %4, %4
239/// %6 = extractelement <4 x i8> %5, i32 0
240/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
241/// %7 = extractelement <4 x i8> %5, i32 1
242/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
243/// %8 = extractelement <4 x i8> %5, i32 2
244/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
245/// %9 = extractelement <4 x i8> %5, i32 3
246/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
247/// ret <4 x i8> %ins4
248/// InstCombiner transforms this into a shuffle and vector mul
249static Optional<TargetTransformInfo::ShuffleKind>
250isShuffle(ArrayRef<Value *> VL) {
251 auto *EI0 = cast<ExtractElementInst>(VL[0]);
252 unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
253 Value *Vec1 = nullptr;
254 Value *Vec2 = nullptr;
255 enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
256 ShuffleMode CommonShuffleMode = Unknown;
257 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
258 auto *EI = cast<ExtractElementInst>(VL[I]);
259 auto *Vec = EI->getVectorOperand();
260 // All vector operands must have the same number of vector elements.
261 if (Vec->getType()->getVectorNumElements() != Size)
262 return None;
263 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
264 if (!Idx)
265 return None;
266 // Undefined behavior if Idx is negative or >= Size.
267 if (Idx->getValue().uge(Size))
268 continue;
269 unsigned IntIdx = Idx->getValue().getZExtValue();
270 // We can extractelement from undef vector.
271 if (isa<UndefValue>(Vec))
272 continue;
273 // For correct shuffling we have to have at most 2 different vector operands
274 // in all extractelement instructions.
275 if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
276 return None;
277 if (CommonShuffleMode == Permute)
278 continue;
279 // If the extract index is not the same as the operation number, it is a
280 // permutation.
281 if (IntIdx != I) {
282 CommonShuffleMode = Permute;
283 continue;
284 }
285 // Check the shuffle mode for the current operation.
286 if (!Vec1)
287 Vec1 = Vec;
288 else if (Vec != Vec1)
289 Vec2 = Vec;
290 // Example: shufflevector A, B, <0,5,2,7>
291 // I is odd and IntIdx for A == I - FirstAlternate shuffle.
292 // I is even and IntIdx for B == I - FirstAlternate shuffle.
293 // Example: shufflevector A, B, <4,1,6,3>
294 // I is even and IntIdx for A == I - SecondAlternate shuffle.
295 // I is odd and IntIdx for B == I - SecondAlternate shuffle.
296 const bool IIsEven = I & 1;
297 const bool CurrVecIsA = Vec == Vec1;
298 const bool IIsOdd = !IIsEven;
299 const bool CurrVecIsB = !CurrVecIsA;
300 ShuffleMode CurrentShuffleMode =
301 ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
302 : SecondAlternate;
303 // Common mode is not set or the same as the shuffle mode of the current
304 // operation - alternate.
305 if (CommonShuffleMode == Unknown)
306 CommonShuffleMode = CurrentShuffleMode;
307 // Common shuffle mode is not the same as the shuffle mode of the current
308 // operation - permutation.
309 if (CommonShuffleMode != CurrentShuffleMode)
310 CommonShuffleMode = Permute;
311 }
312 // If we're not crossing lanes in different vectors, consider it as blending.
313 if ((CommonShuffleMode == FirstAlternate ||
314 CommonShuffleMode == SecondAlternate) &&
315 Vec2)
316 return TargetTransformInfo::SK_Alternate;
317 // If Vec2 was never used, we have a permutation of a single vector, otherwise
318 // we have permutation of 2 vectors.
319 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
320 : TargetTransformInfo::SK_PermuteSingleSrc;
321}
322
323///\returns Opcode that can be clubbed with \p Op to create an alternate
324/// sequence which can later be merged as a ShuffleVector instruction.
325static unsigned getAltOpcode(unsigned Op) {
326 switch (Op) {
327 case Instruction::FAdd:
328 return Instruction::FSub;
329 case Instruction::FSub:
330 return Instruction::FAdd;
331 case Instruction::Add:
332 return Instruction::Sub;
333 case Instruction::Sub:
334 return Instruction::Add;
335 default:
336 return 0;
337 }
338}
339
340static bool isOdd(unsigned Value) {
341 return Value & 1;
342}
343
344static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
345 unsigned CheckedOpcode) {
346 return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
347}
348
349/// Chooses the correct key for scheduling data. If \p Op has the same (or
350/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
351/// OpValue.
352static Value *isOneOf(Value *OpValue, Value *Op) {
353 auto *I = dyn_cast<Instruction>(Op);
354 if (!I)
355 return OpValue;
356 auto *OpInst = cast<Instruction>(OpValue);
357 unsigned OpInstOpcode = OpInst->getOpcode();
358 unsigned IOpcode = I->getOpcode();
359 if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
360 return Op;
361 return OpValue;
362}
363
364namespace {
365
366/// Contains data for the instructions going to be vectorized.
367struct RawInstructionsData {
368 /// Main Opcode of the instructions going to be vectorized.
369 unsigned Opcode = 0;
370
371 /// The list of instructions have some instructions with alternate opcodes.
372 bool HasAltOpcodes = false;
373};
374
375} // end anonymous namespace
376
377/// Checks the list of the vectorized instructions \p VL and returns info about
378/// this list.
379static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
380 auto *I0 = dyn_cast<Instruction>(VL[0]);
381 if (!I0)
382 return {};
383 RawInstructionsData Res;
384 unsigned Opcode = I0->getOpcode();
385 // Walk through the list of the vectorized instructions
386 // in order to check its structure described by RawInstructionsData.
387 for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
388 auto *I = dyn_cast<Instruction>(VL[Cnt]);
389 if (!I)
390 return {};
391 if (Opcode != I->getOpcode())
392 Res.HasAltOpcodes = true;
393 }
394 Res.Opcode = Opcode;
395 return Res;
396}
397
398namespace {
399
400/// Main data required for vectorization of instructions.
401struct InstructionsState {
402 /// The very first instruction in the list with the main opcode.
403 Value *OpValue = nullptr;
404
405 /// The main opcode for the list of instructions.
406 unsigned Opcode = 0;
407
408 /// Some of the instructions in the list have alternate opcodes.
409 bool IsAltShuffle = false;
410
411 InstructionsState() = default;
412 InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
413 : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
414};
415
416} // end anonymous namespace
417
418/// \returns analysis of the instructions in \p VL described in
419/// InstructionsState: the Opcode with which we suppose the whole list
420/// could be vectorized, even if its structure is diverse.
421static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
422 auto Res = getMainOpcode(VL);
423 unsigned Opcode = Res.Opcode;
424 if (!Res.HasAltOpcodes)
425 return InstructionsState(VL[0], Opcode, false);
426 auto *OpInst = cast<Instruction>(VL[0]);
427 unsigned AltOpcode = getAltOpcode(Opcode);
428 // Examine each element in the list instructions VL to determine
429 // if some operations there could be considered as an alternative
430 // (for example as subtraction relates to addition operation).
431 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
432 auto *I = cast<Instruction>(VL[Cnt]);
433 unsigned InstOpcode = I->getOpcode();
434 if ((Res.HasAltOpcodes &&
435 InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
436 (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
437 return InstructionsState(OpInst, 0, false);
438 }
439 }
440 return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
441}
442
443/// \returns true if all of the values in \p VL have the same type or false
444/// otherwise.
445static bool allSameType(ArrayRef<Value *> VL) {
446 Type *Ty = VL[0]->getType();
447 for (int i = 1, e = VL.size(); i < e; i++)
448 if (VL[i]->getType() != Ty)
449 return false;
450
451 return true;
452}
453
454/// \returns True if Extract{Value,Element} instruction extracts element Idx.
455static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
456 assert(Opcode == Instruction::ExtractElement ||
457        Opcode == Instruction::ExtractValue);
458 if (Opcode == Instruction::ExtractElement) {
459 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
460 return CI && CI->getZExtValue() == Idx;
461 } else {
462 ExtractValueInst *EI = cast<ExtractValueInst>(E);
463 return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
464 }
465}
466
467/// \returns True if in-tree use also needs extract. This refers to
468/// possible scalar operand in vectorized instruction.
469static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
470 TargetLibraryInfo *TLI) {
471 unsigned Opcode = UserInst->getOpcode();
472 switch (Opcode) {
473 case Instruction::Load: {
474 LoadInst *LI = cast<LoadInst>(UserInst);
475 return (LI->getPointerOperand() == Scalar);
476 }
477 case Instruction::Store: {
478 StoreInst *SI = cast<StoreInst>(UserInst);
479 return (SI->getPointerOperand() == Scalar);
480 }
481 case Instruction::Call: {
482 CallInst *CI = cast<CallInst>(UserInst);
483 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
484 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
485 return (CI->getArgOperand(1) == Scalar);
486 }
487 LLVM_FALLTHROUGH;
488 }
489 default:
490 return false;
491 }
492}
493
494/// \returns the AA location that is being accessed by the instruction.
495static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
496 if (StoreInst *SI = dyn_cast<StoreInst>(I))
497 return MemoryLocation::get(SI);
498 if (LoadInst *LI = dyn_cast<LoadInst>(I))
499 return MemoryLocation::get(LI);
500 return MemoryLocation();
501}
502
503/// \returns True if the instruction is not a volatile or atomic load/store.
504static bool isSimple(Instruction *I) {
505 if (LoadInst *LI = dyn_cast<LoadInst>(I))
506 return LI->isSimple();
507 if (StoreInst *SI = dyn_cast<StoreInst>(I))
508 return SI->isSimple();
509 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
510 return !MI->isVolatile();
511 return true;
512}
513
514namespace llvm {
515
516namespace slpvectorizer {
517
518/// Bottom Up SLP Vectorizer.
519class BoUpSLP {
520public:
521 using ValueList = SmallVector<Value *, 8>;
522 using InstrList = SmallVector<Instruction *, 16>;
523 using ValueSet = SmallPtrSet<Value *, 16>;
524 using StoreList = SmallVector<StoreInst *, 8>;
525 using ExtraValueToDebugLocsMap =
526 MapVector<Value *, SmallVector<Instruction *, 2>>;
527
528 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
529 TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
530 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
531 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
532 : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
533 DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
534 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
535 // Use the vector register size specified by the target unless overridden
536 // by a command-line option.
537 // TODO: It would be better to limit the vectorization factor based on
538 // data type rather than just register size. For example, x86 AVX has
539 // 256-bit registers, but it does not support integer operations
540 // at that width (that requires AVX2).
541 if (MaxVectorRegSizeOption.getNumOccurrences())
542 MaxVecRegSize = MaxVectorRegSizeOption;
543 else
544 MaxVecRegSize = TTI->getRegisterBitWidth(true);
545
546 if (MinVectorRegSizeOption.getNumOccurrences())
547 MinVecRegSize = MinVectorRegSizeOption;
548 else
549 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
550 }
551
552 /// \brief Vectorize the tree that starts with the elements in \p VL.
553 /// Returns the vectorized root.
554 Value *vectorizeTree();
555
556 /// Vectorize the tree but with the list of externally used values \p
557 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
558 /// generated extractvalue instructions.
559 Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
560
561 /// \returns the cost incurred by unwanted spills and fills, caused by
562 /// holding live values over call sites.
563 int getSpillCost();
564
565 /// \returns the vectorization cost of the subtree that starts at \p VL.
566 /// A negative number means that this is profitable.
567 int getTreeCost();
568
569 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
570 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
571 void buildTree(ArrayRef<Value *> Roots,
572 ArrayRef<Value *> UserIgnoreLst = None);
573
574 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
575 /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
576 /// into account (and updating it, if required) the list of externally used
577 /// values stored in \p ExternallyUsedValues.
578 void buildTree(ArrayRef<Value *> Roots,
579 ExtraValueToDebugLocsMap &ExternallyUsedValues,
580 ArrayRef<Value *> UserIgnoreLst = None);
581
582 /// Clear the internal data structures that are created by 'buildTree'.
583 void deleteTree() {
584 VectorizableTree.clear();
585 ScalarToTreeEntry.clear();
586 MustGather.clear();
587 ExternalUses.clear();
588 NumLoadsWantToKeepOrder = 0;
589 NumLoadsWantToChangeOrder = 0;
590 for (auto &Iter : BlocksSchedules) {
591 BlockScheduling *BS = Iter.second.get();
592 BS->clear();
593 }
594 MinBWs.clear();
595 }
596
597 unsigned getTreeSize() const { return VectorizableTree.size(); }
598
599 /// \brief Perform LICM and CSE on the newly generated gather sequences.
600 void optimizeGatherSequence();
601
602 /// \returns true if it is beneficial to reverse the vector order.
603 bool shouldReorder() const {
604 return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
605 }
606
607 /// \return The vector element size in bits to use when vectorizing the
608 /// expression tree ending at \p V. If V is a store, the size is the width of
609 /// the stored value. Otherwise, the size is the width of the largest loaded
610 /// value reaching V. This method is used by the vectorizer to calculate
611 /// vectorization factors.
612 unsigned getVectorElementSize(Value *V);
613
614 /// Compute the minimum type sizes required to represent the entries in a
615 /// vectorizable tree.
616 void computeMinimumValueSizes();
617
618 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
619 unsigned getMaxVecRegSize() const {
620 return MaxVecRegSize;
621 }
622
623 // \returns minimum vector register size as set by cl::opt.
624 unsigned getMinVecRegSize() const {
625 return MinVecRegSize;
626 }
627
628 /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
629 ///
630 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
631 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
632
633 /// \returns True if the VectorizableTree is both tiny and not fully
634 /// vectorizable. We do not vectorize such trees.
635 bool isTreeTinyAndNotFullyVectorizable();
636
637 OptimizationRemarkEmitter *getORE() { return ORE; }
638
639private:
640 struct TreeEntry;
641
642 /// Checks if all users of \p I are the part of the vectorization tree.
643 bool areAllUsersVectorized(Instruction *I) const;
644
645 /// \returns the cost of the vectorizable entry.
646 int getEntryCost(TreeEntry *E);
647
648 /// This is the recursive part of buildTree.
649 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
650
651 /// \returns True if the ExtractElement/ExtractValue instructions in VL can
652 /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
653 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
654
655 /// Vectorize a single entry in the tree.
656 Value *vectorizeTree(TreeEntry *E);
657
658 /// Vectorize a single entry in the tree, starting in \p VL.
659 Value *vectorizeTree(ArrayRef<Value *> VL);
660
661 /// \returns the pointer to the vectorized value if \p VL is already
662 /// vectorized, or NULL. This may happen in cycles.
663 Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
664
665 /// \returns the scalarization cost for this type. Scalarization in this
666 /// context means the creation of vectors from a group of scalars.
667 int getGatherCost(Type *Ty);
668
669 /// \returns the scalarization cost for this list of values. Assuming that
670 /// this subtree gets vectorized, we may need to extract the values from the
671 /// roots. This method calculates the cost of extracting the values.
672 int getGatherCost(ArrayRef<Value *> VL);
673
674 /// \brief Set the Builder insert point to one after the last instruction in
675 /// the bundle
676 void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
677
678 /// \returns a vector from a collection of scalars in \p VL.
679 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
680
681 /// \returns whether the VectorizableTree is fully vectorizable and will
682 /// be beneficial even if the tree height is tiny.
683 bool isFullyVectorizableTinyTree();
684
685 /// \brief Reorder commutative operands in alt shuffle if they result in
686 /// vectorized code.
687 void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
688 SmallVectorImpl<Value *> &Left,
689 SmallVectorImpl<Value *> &Right);
690
691 /// \brief Reorder commutative operands to get better probability of
692 /// generating vectorized code.
693 void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
694 SmallVectorImpl<Value *> &Left,
695 SmallVectorImpl<Value *> &Right);
696 struct TreeEntry {
697 TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}
698
699 /// \returns true if the scalars in VL are equal to this entry.
700 bool isSame(ArrayRef<Value *> VL) const {
701 assert(VL.size() == Scalars.size() && "Invalid size");
702 return std::equal(VL.begin(), VL.end(), Scalars.begin());
703 }
704
705 /// A vector of scalars.
706 ValueList Scalars;
707
708 /// The Scalars are vectorized into this value. It is initialized to Null.
709 Value *VectorizedValue = nullptr;
710
711 /// Do we need to gather this sequence ?
712 bool NeedToGather = false;
713
714 /// Points back to the VectorizableTree.
715 ///
716 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
717 /// to be a pointer and needs to be able to initialize the child iterator.
718 /// Thus we need a reference back to the container to translate the indices
719 /// to entries.
720 std::vector<TreeEntry> &Container;
721
722 /// The TreeEntry index containing the user of this entry. We can actually
723 /// have multiple users so the data structure is not truly a tree.
724 SmallVector<int, 1> UserTreeIndices;
725 };
726
727 /// Create a new VectorizableTree entry.
728 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
729 int &UserTreeIdx) {
730 VectorizableTree.emplace_back(VectorizableTree);
731 int idx = VectorizableTree.size() - 1;
732 TreeEntry *Last = &VectorizableTree[idx];
733 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
734 Last->NeedToGather = !Vectorized;
735 if (Vectorized) {
736 for (int i = 0, e = VL.size(); i != e; ++i) {
737 assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
738 ScalarToTreeEntry[VL[i]] = idx;
739 }
740 } else {
741 MustGather.insert(VL.begin(), VL.end());
742 }
743
744 if (UserTreeIdx >= 0)
745 Last->UserTreeIndices.push_back(UserTreeIdx);
746 UserTreeIdx = idx;
747 return Last;
748 }
749
750 /// -- Vectorization State --
751 /// Holds all of the tree entries.
752 std::vector<TreeEntry> VectorizableTree;
753
754 TreeEntry *getTreeEntry(Value *V) {
755 auto I = ScalarToTreeEntry.find(V);
756 if (I != ScalarToTreeEntry.end())
757 return &VectorizableTree[I->second];
758 return nullptr;
759 }
760
761 const TreeEntry *getTreeEntry(Value *V) const {
762 auto I = ScalarToTreeEntry.find(V);
763 if (I != ScalarToTreeEntry.end())
764 return &VectorizableTree[I->second];
765 return nullptr;
766 }
767
768 /// Maps a specific scalar to its tree entry.
769 SmallDenseMap<Value*, int> ScalarToTreeEntry;
770
771 /// A list of scalars that we found that we need to keep as scalars.
772 ValueSet MustGather;
773
774 /// This POD struct describes one external user in the vectorized tree.
775 struct ExternalUser {
776 ExternalUser(Value *S, llvm::User *U, int L)
777 : Scalar(S), User(U), Lane(L) {}
778
779 // Which scalar in our function.
780 Value *Scalar;
781
782 // Which user that uses the scalar.
783 llvm::User *User;
784
785 // Which lane does the scalar belong to.
786 int Lane;
787 };
788 using UserList = SmallVector<ExternalUser, 16>;
789
790 /// Checks if two instructions may access the same memory.
791 ///
792 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
793 /// is invariant in the calling loop.
794 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
795 Instruction *Inst2) {
796 // First check if the result is already in the cache.
797 AliasCacheKey key = std::make_pair(Inst1, Inst2);
798 Optional<bool> &result = AliasCache[key];
799 if (result.hasValue()) {
800 return result.getValue();
801 }
802 MemoryLocation Loc2 = getLocation(Inst2, AA);
803 bool aliased = true;
804 if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
805 // Do the alias check.
806 aliased = AA->alias(Loc1, Loc2);
807 }
808 // Store the result in the cache.
809 result = aliased;
810 return aliased;
811 }
812
813 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
814
815 /// Cache for alias results.
816 /// TODO: consider moving this to the AliasAnalysis itself.
817 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
818
819 /// Removes an instruction from its block and eventually deletes it.
820 /// It's like Instruction::eraseFromParent() except that the actual deletion
821 /// is delayed until BoUpSLP is destructed.
822 /// This is required to ensure that there are no incorrect collisions in the
823 /// AliasCache, which can happen if a new instruction is allocated at the
824 /// same address as a previously deleted instruction.
825 void eraseInstruction(Instruction *I) {
826 I->removeFromParent();
827 I->dropAllReferences();
828 DeletedInstructions.emplace_back(I);
829 }
830
831 /// Temporary store for deleted instructions. Instructions will be deleted
832 /// eventually when the BoUpSLP is destructed.
833 SmallVector<unique_value, 8> DeletedInstructions;
834
835 /// A list of values that need to extracted out of the tree.
836 /// This list holds pairs of (Internal Scalar : External User). External User
837 /// can be nullptr, which means that this Internal Scalar will be used later,
838 /// after vectorization.
839 UserList ExternalUses;
840
841 /// Values used only by @llvm.assume calls.
842 SmallPtrSet<const Value *, 32> EphValues;
843
844 /// Holds all of the instructions that we gathered.
845 SetVector<Instruction *> GatherSeq;
846
847 /// A list of blocks that we are going to CSE.
848 SetVector<BasicBlock *> CSEBlocks;
849
850 /// Contains all scheduling relevant data for an instruction.
851 /// A ScheduleData either represents a single instruction or a member of an
852 /// instruction bundle (= a group of instructions which is combined into a
853 /// vector instruction).
854 struct ScheduleData {
855 // The initial value for the dependency counters. It means that the
856 // dependencies are not calculated yet.
857 enum { InvalidDeps = -1 };
858
859 ScheduleData() = default;
860
861 void init(int BlockSchedulingRegionID, Value *OpVal) {
862 FirstInBundle = this;
863 NextInBundle = nullptr;
864 NextLoadStore = nullptr;
865 IsScheduled = false;
866 SchedulingRegionID = BlockSchedulingRegionID;
867 UnscheduledDepsInBundle = UnscheduledDeps;
868 clearDependencies();
869 OpValue = OpVal;
870 }
871
872 /// Returns true if the dependency information has been calculated.
873 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
874
875 /// Returns true for single instructions and for bundle representatives
876 /// (= the head of a bundle).
877 bool isSchedulingEntity() const { return FirstInBundle == this; }
878
879 /// Returns true if it represents an instruction bundle and not only a
880 /// single instruction.
881 bool isPartOfBundle() const {
882 return NextInBundle != nullptr || FirstInBundle != this;
883 }
884
885 /// Returns true if it is ready for scheduling, i.e. it has no more
886 /// unscheduled depending instructions/bundles.
887 bool isReady() const {
888 assert(isSchedulingEntity() &&
889        "can't consider non-scheduling entity for ready list");
890 return UnscheduledDepsInBundle == 0 && !IsScheduled;
891 }
892
893 /// Modifies the number of unscheduled dependencies, also updating it for
894 /// the whole bundle.
895 int incrementUnscheduledDeps(int Incr) {
896 UnscheduledDeps += Incr;
897 return FirstInBundle->UnscheduledDepsInBundle += Incr;
898 }
899
900 /// Sets the number of unscheduled dependencies to the number of
901 /// dependencies.
902 void resetUnscheduledDeps() {
903 incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
904 }
905
906 /// Clears all dependency information.
907 void clearDependencies() {
908 Dependencies = InvalidDeps;
909 resetUnscheduledDeps();
910 MemoryDependencies.clear();
911 }
912
913 void dump(raw_ostream &os) const {
914 if (!isSchedulingEntity()) {
915 os << "/ " << *Inst;
916 } else if (NextInBundle) {
917 os << '[' << *Inst;
918 ScheduleData *SD = NextInBundle;
919 while (SD) {
920 os << ';' << *SD->Inst;
921 SD = SD->NextInBundle;
922 }
923 os << ']';
924 } else {
925 os << *Inst;
926 }
927 }
928
929 Instruction *Inst = nullptr;
930
931 /// Points to the head in an instruction bundle (and always to this for
932 /// single instructions).
933 ScheduleData *FirstInBundle = nullptr;
934
935 /// Single linked list of all instructions in a bundle. Null if it is a
936 /// single instruction.
937 ScheduleData *NextInBundle = nullptr;
938
939 /// Single linked list of all memory instructions (e.g. load, store, call)
940 /// in the block - until the end of the scheduling region.
941 ScheduleData *NextLoadStore = nullptr;
942
943 /// The dependent memory instructions.
944 /// This list is derived on demand in calculateDependencies().
945 SmallVector<ScheduleData *, 4> MemoryDependencies;
946
947 /// This ScheduleData is in the current scheduling region if this matches
948 /// the current SchedulingRegionID of BlockScheduling.
949 int SchedulingRegionID = 0;
950
951 /// Used for getting a "good" final ordering of instructions.
952 int SchedulingPriority = 0;
953
954 /// The number of dependencies. Constitutes of the number of users of the
955 /// instruction plus the number of dependent memory instructions (if any).
956 /// This value is calculated on demand.
957 /// If InvalidDeps, the number of dependencies is not calculated yet.
958 int Dependencies = InvalidDeps;
959
960 /// The number of dependencies minus the number of dependencies of scheduled
961 /// instructions. As soon as this is zero, the instruction/bundle gets ready
962 /// for scheduling.
963 /// Note that this is negative as long as Dependencies is not calculated.
964 int UnscheduledDeps = InvalidDeps;
965
966 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
967 /// single instructions.
968 int UnscheduledDepsInBundle = InvalidDeps;
969
970 /// True if this instruction is scheduled (or considered as scheduled in the
971 /// dry-run).
972 bool IsScheduled = false;
973
974 /// Opcode of the current instruction in the schedule data.
975 Value *OpValue = nullptr;
976 };
977
978#ifndef NDEBUG
979 friend inline raw_ostream &operator<<(raw_ostream &os,
980 const BoUpSLP::ScheduleData &SD) {
981 SD.dump(os);
982 return os;
983 }
984#endif
985
986 friend struct GraphTraits<BoUpSLP *>;
987 friend struct DOTGraphTraits<BoUpSLP *>;
988
989 /// Contains all scheduling data for a basic block.
990 struct BlockScheduling {
991 BlockScheduling(BasicBlock *BB)
992 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
993
994 void clear() {
995 ReadyInsts.clear();
996 ScheduleStart = nullptr;
997 ScheduleEnd = nullptr;
998 FirstLoadStoreInRegion = nullptr;
999 LastLoadStoreInRegion = nullptr;
1000
1001 // Reduce the maximum schedule region size by the size of the
1002 // previous scheduling run.
1003 ScheduleRegionSizeLimit -= ScheduleRegionSize;
1004 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
1005 ScheduleRegionSizeLimit = MinScheduleRegionSize;
1006 ScheduleRegionSize = 0;
1007
1008 // Make a new scheduling region, i.e. all existing ScheduleData is not
1009 // in the new region yet.
1010 ++SchedulingRegionID;
1011 }
1012
1013 ScheduleData *getScheduleData(Value *V) {
1014 ScheduleData *SD = ScheduleDataMap[V];
1015 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1016 return SD;
1017 return nullptr;
1018 }
1019
1020 ScheduleData *getScheduleData(Value *V, Value *Key) {
1021 if (V == Key)
1022 return getScheduleData(V);
1023 auto I = ExtraScheduleDataMap.find(V);
1024 if (I != ExtraScheduleDataMap.end()) {
1025 ScheduleData *SD = I->second[Key];
1026 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1027 return SD;
1028 }
1029 return nullptr;
1030 }
1031
1032 bool isInSchedulingRegion(ScheduleData *SD) {
1033 return SD->SchedulingRegionID == SchedulingRegionID;
1034 }
1035
1036 /// Marks an instruction as scheduled and puts all dependent ready
1037 /// instructions into the ready-list.
1038 template <typename ReadyListType>
1039 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
1040 SD->IsScheduled = true;
1041 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
1042
1043 ScheduleData *BundleMember = SD;
1044 while (BundleMember) {
1045 if (BundleMember->Inst != BundleMember->OpValue) {
1046 BundleMember = BundleMember->NextInBundle;
1047 continue;
1048 }
1049 // Handle the def-use chain dependencies.
1050 for (Use &U : BundleMember->Inst->operands()) {
1051 auto *I = dyn_cast<Instruction>(U.get());
1052 if (!I)
1053 continue;
1054 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
1055 if (OpDef && OpDef->hasValidDependencies() &&
1056 OpDef->incrementUnscheduledDeps(-1) == 0) {
1057 // There are no more unscheduled dependencies after
1058 // decrementing, so we can put the dependent instruction
1059 // into the ready list.
1060 ScheduleData *DepBundle = OpDef->FirstInBundle;
1061 assert(!DepBundle->IsScheduled &&
1062        "already scheduled bundle gets ready");
1063 ReadyList.insert(DepBundle);
1064 DEBUG(dbgs()
1065       << "SLP: gets ready (def): " << *DepBundle << "\n");
1066 }
1067 });
1068 }
1069 // Handle the memory dependencies.
1070 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
1071 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
1072 // There are no more unscheduled dependencies after decrementing,
1073 // so we can put the dependent instruction into the ready list.
1074 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
1075 assert(!DepBundle->IsScheduled &&
1076        "already scheduled bundle gets ready");
1077 ReadyList.insert(DepBundle);
1078 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
1079              << "\n");
1080 }
1081 }
1082 BundleMember = BundleMember->NextInBundle;
1083 }
1084 }
1085
1086 void doForAllOpcodes(Value *V,
1087 function_ref<void(ScheduleData *SD)> Action) {
1088 if (ScheduleData *SD = getScheduleData(V))
1089 Action(SD);
1090 auto I = ExtraScheduleDataMap.find(V);
1091 if (I != ExtraScheduleDataMap.end())
1092 for (auto &P : I->second)
1093 if (P.second->SchedulingRegionID == SchedulingRegionID)
1094 Action(P.second);
1095 }
1096
1097 /// Put all instructions into the ReadyList which are ready for scheduling.
1098 template <typename ReadyListType>
1099 void initialFillReadyList(ReadyListType &ReadyList) {
1100 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
1101 doForAllOpcodes(I, [&](ScheduleData *SD) {
1102 if (SD->isSchedulingEntity() && SD->isReady()) {
1103 ReadyList.insert(SD);
1104 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
1105 }
1106 });
1107 }
1108 }
1109
1110 /// Checks if a bundle of instructions can be scheduled, i.e. has no
1111 /// cyclic dependencies. This is only a dry-run, no instructions are
1112 /// actually moved at this stage.
1113 bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
1114
1115 /// Un-bundles a group of instructions.
1116 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
1117
1118 /// Allocates schedule data chunk.
1119 ScheduleData *allocateScheduleDataChunks();
1120
1121 /// Extends the scheduling region so that V is inside the region.
1122 /// \returns true if the region size is within the limit.
1123 bool extendSchedulingRegion(Value *V, Value *OpValue);
1124
1125 /// Initialize the ScheduleData structures for new instructions in the
1126 /// scheduling region.
1127 void initScheduleData(Instruction *FromI, Instruction *ToI,
1128 ScheduleData *PrevLoadStore,
1129 ScheduleData *NextLoadStore);
1130
1131 /// Updates the dependency information of a bundle and of all instructions/
1132 /// bundles which depend on the original bundle.
1133 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
1134 BoUpSLP *SLP);
1135
1136 /// Sets all instructions in the scheduling region to un-scheduled.
1137 void resetSchedule();
1138
1139 BasicBlock *BB;
1140
1141 /// Simple memory allocation for ScheduleData.
1142 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
1143
1144 /// The size of a ScheduleData array in ScheduleDataChunks.
1145 int ChunkSize;
1146
1147 /// The allocator position in the current chunk, which is the last entry
1148 /// of ScheduleDataChunks.
1149 int ChunkPos;
1150
1151 /// Attaches ScheduleData to Instruction.
1152 /// Note that the mapping survives during all vectorization iterations, i.e.
1153 /// ScheduleData structures are recycled.
1154 DenseMap<Value *, ScheduleData *> ScheduleDataMap;
1155
1156 /// Attaches ScheduleData to Instruction with the leading key.
1157 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
1158 ExtraScheduleDataMap;
1159
1160 struct ReadyList : SmallVector<ScheduleData *, 8> {
1161 void insert(ScheduleData *SD) { push_back(SD); }
1162 };
1163
1164 /// The ready-list for scheduling (only used for the dry-run).
1165 ReadyList ReadyInsts;
1166
1167 /// The first instruction of the scheduling region.
1168 Instruction *ScheduleStart = nullptr;
1169
1170 /// The first instruction _after_ the scheduling region.
1171 Instruction *ScheduleEnd = nullptr;
1172
1173 /// The first memory accessing instruction in the scheduling region
1174 /// (can be null).
1175 ScheduleData *FirstLoadStoreInRegion = nullptr;
1176
1177 /// The last memory accessing instruction in the scheduling region
1178 /// (can be null).
1179 ScheduleData *LastLoadStoreInRegion = nullptr;
1180
1181 /// The current size of the scheduling region.
1182 int ScheduleRegionSize = 0;
1183
1184 /// The maximum size allowed for the scheduling region.
1185 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
1186
1187 /// The ID of the scheduling region. For a new vectorization iteration this
1188 /// is incremented which "removes" all ScheduleData from the region.
1189 // Make sure that the initial SchedulingRegionID is greater than the
1190 // initial SchedulingRegionID in ScheduleData (which is 0).
1191 int SchedulingRegionID = 1;
1192 };
1193
1194 /// Attaches the BlockScheduling structures to basic blocks.
1195 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
1196
1197 /// Performs the "real" scheduling. Done before vectorization is actually
1198 /// performed in a basic block.
1199 void scheduleBlock(BlockScheduling *BS);
1200
1201 /// List of users to ignore during scheduling and that don't need extracting.
1202 ArrayRef<Value *> UserIgnoreList;
1203
1204 // Number of load bundles that contain consecutive loads.
1205 int NumLoadsWantToKeepOrder = 0;
1206
1207 // Number of load bundles that contain consecutive loads in reversed order.
1208 int NumLoadsWantToChangeOrder = 0;
1209
1210 // Analysis and block reference.
1211 Function *F;
1212 ScalarEvolution *SE;
1213 TargetTransformInfo *TTI;
1214 TargetLibraryInfo *TLI;
1215 AliasAnalysis *AA;
1216 LoopInfo *LI;
1217 DominatorTree *DT;
1218 AssumptionCache *AC;
1219 DemandedBits *DB;
1220 const DataLayout *DL;
1221 OptimizationRemarkEmitter *ORE;
1222
1223 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
1224 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
1225
1226 /// Instruction builder to construct the vectorized tree.
1227 IRBuilder<> Builder;
1228
1229 /// A map of scalar integer values to the smallest bit width with which they
1230 /// can legally be represented. The values map to (width, signed) pairs,
1231 /// where "width" indicates the minimum bit width and "signed" is True if the
1232 /// value must be signed-extended, rather than zero-extended, back to its
1233 /// original width.
1234 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
1235};
1236
1237} // end namespace slpvectorizer
1238
1239template <> struct GraphTraits<BoUpSLP *> {
1240 using TreeEntry = BoUpSLP::TreeEntry;
1241
1242 /// NodeRef has to be a pointer per the GraphWriter.
1243 using NodeRef = TreeEntry *;
1244
1245 /// \brief Add the VectorizableTree to the index iterator to be able to return
1246 /// TreeEntry pointers.
1247 struct ChildIteratorType
1248 : public iterator_adaptor_base<ChildIteratorType,
1249 SmallVector<int, 1>::iterator> {
1250 std::vector<TreeEntry> &VectorizableTree;
1251
1252 ChildIteratorType(SmallVector<int, 1>::iterator W,
1253 std::vector<TreeEntry> &VT)
1254 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
1255
1256 NodeRef operator*() { return &VectorizableTree[*I]; }
1257 };
1258
1259 static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
1260
1261 static ChildIteratorType child_begin(NodeRef N) {
1262 return {N->UserTreeIndices.begin(), N->Container};
1263 }
1264
1265 static ChildIteratorType child_end(NodeRef N) {
1266 return {N->UserTreeIndices.end(), N->Container};
1267 }
1268
1269 /// For the node iterator we just need to turn the TreeEntry iterator into a
1270 /// TreeEntry* iterator so that it dereferences to NodeRef.
1271 using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;
1272
1273 static nodes_iterator nodes_begin(BoUpSLP *R) {
1274 return nodes_iterator(R->VectorizableTree.begin());
1275 }
1276
1277 static nodes_iterator nodes_end(BoUpSLP *R) {
1278 return nodes_iterator(R->VectorizableTree.end());
1279 }
1280
1281 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
1282};
1283
1284template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
1285 using TreeEntry = BoUpSLP::TreeEntry;
1286
1287 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
1288
1289 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
1290 std::string Str;
1291 raw_string_ostream OS(Str);
1292 if (isSplat(Entry->Scalars)) {
1293 OS << "<splat> " << *Entry->Scalars[0];
1294 return Str;
1295 }
1296 for (auto V : Entry->Scalars) {
1297 OS << *V;
1298 if (std::any_of(
1299 R->ExternalUses.begin(), R->ExternalUses.end(),
1300 [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
1301 OS << " <extract>";
1302 OS << "\n";
1303 }
1304 return Str;
1305 }
1306
1307 static std::string getNodeAttributes(const TreeEntry *Entry,
1308 const BoUpSLP *) {
1309 if (Entry->NeedToGather)
1310 return "color=red";
1311 return "";
1312 }
1313};
1314
1315} // end namespace llvm
1316
1317void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1318 ArrayRef<Value *> UserIgnoreLst) {
1319 ExtraValueToDebugLocsMap ExternallyUsedValues;
1320 buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
1321}
1322
1323void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1324 ExtraValueToDebugLocsMap &ExternallyUsedValues,
1325 ArrayRef<Value *> UserIgnoreLst) {
1326 deleteTree();
1327 UserIgnoreList = UserIgnoreLst;
1328 if (!allSameType(Roots))
1329 return;
1330 buildTree_rec(Roots, 0, -1);
1331
1332 // Collect the values that we need to extract from the tree.
1333 for (TreeEntry &EIdx : VectorizableTree) {
1334 TreeEntry *Entry = &EIdx;
1335
1336 // No need to handle users of gathered values.
1337 if (Entry->NeedToGather)
1338 continue;
1339
1340 // For each lane:
1341 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
1342 Value *Scalar = Entry->Scalars[Lane];
1343
1344 // Check if the scalar is externally used as an extra arg.
1345 auto ExtI = ExternallyUsedValues.find(Scalar);
1346 if (ExtI != ExternallyUsedValues.end()) {
1347 DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
1348              Lane << " from " << *Scalar << ".\n");
1349 ExternalUses.emplace_back(Scalar, nullptr, Lane);
1350 continue;
1351 }
1352 for (User *U : Scalar->users()) {
1353 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
1354
1355 Instruction *UserInst = dyn_cast<Instruction>(U);
1356 if (!UserInst)
1357 continue;
1358
1359 // Skip in-tree scalars that become vectors
1360 if (TreeEntry *UseEntry = getTreeEntry(U)) {
1361 Value *UseScalar = UseEntry->Scalars[0];
1362 // Some in-tree scalars will remain as scalar in vectorized
1363 // instructions. If that is the case, the one in Lane 0 will
1364 // be used.
1365 if (UseScalar != U ||
1366 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
1367 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
1368              << ".\n");
1369 assert(!UseEntry->NeedToGather && "Bad state");
1370 continue;
1371 }
1372 }
1373
1374 // Ignore users in the user ignore list.
1375 if (is_contained(UserIgnoreList, UserInst))
1376 continue;
1377
1378 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
1379              Lane << " from " << *Scalar << ".\n");
1380 ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
1381 }
1382 }
1383 }
1384}
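// An illustrative example of the bookkeeping above (not from the original
// source): if the scalar in lane 2 of a vectorized bundle is also consumed by
// an instruction outside the tree, say a scalar return or an unbundled store,
// an ExternalUser(Scalar, U, 2) record is appended so that a later extraction
// from lane 2 of the vector can replace that scalar use.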
1385
1386void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
1387 int UserTreeIdx) {
1388 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
1389
1390 InstructionsState S = getSameOpcode(VL);
1391 if (Depth == RecursionMaxDepth) {
1392 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
1393 newTreeEntry(VL, false, UserTreeIdx);
1394 return;
1395 }
1396
1397 // Don't handle vectors.
1398 if (S.OpValue->getType()->isVectorTy()) {
1399 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1400 newTreeEntry(VL, false, UserTreeIdx);
1401 return;
1402 }
1403
1404 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
1405 if (SI->getValueOperand()->getType()->isVectorTy()) {
1406 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1407 newTreeEntry(VL, false, UserTreeIdx);
1408 return;
1409 }
1410
1411 // If all of the operands are identical or constant, we have a simple solution.
1412 if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
1413 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1414 newTreeEntry(VL, false, UserTreeIdx);
1415 return;
1416 }
1417
1418 // We now know that this is a vector of instructions of the same type from
1419 // the same block.
1420
1421 // Don't vectorize ephemeral values.
1422 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1423 if (EphValues.count(VL[i])) {
1424 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1425 ") is ephemeral.\n");
1426 newTreeEntry(VL, false, UserTreeIdx);
1427 return;
1428 }
1429 }
1430
1431 // Check if this is a duplicate of another entry.
1432 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
1433 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1434 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1435 if (E->Scalars[i] != VL[i]) {
1436 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1437 newTreeEntry(VL, false, UserTreeIdx);
1438 return;
1439 }
1440 }
1441 // Record the reuse of the tree node. FIXME, currently this is only used to
1442 // properly draw the graph rather than for the actual vectorization.
1443 E->UserTreeIndices.push_back(UserTreeIdx);
1444 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
1445 return;
1446 }
1447
1448 // Check that none of the instructions in the bundle are already in the tree.
1449 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1450 auto *I = dyn_cast<Instruction>(VL[i]);
1451 if (!I)
1452 continue;
1453 if (getTreeEntry(I)) {
1454 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1455 ") is already in tree.\n");
1456 newTreeEntry(VL, false, UserTreeIdx);
1457 return;
1458 }
1459 }
1460
1461 // If any of the scalars is marked as a value that needs to stay scalar, then
1462 // we need to gather the scalars.
1463 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1464 if (MustGather.count(VL[i])) {
1465 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1466 newTreeEntry(VL, false, UserTreeIdx);
1467 return;
1468 }
1469 }
1470
1471 // Check that all of the users of the scalars that we want to vectorize are
1472 // schedulable.
1473 auto *VL0 = cast<Instruction>(S.OpValue);
1474 BasicBlock *BB = VL0->getParent();
1475
1476 if (!DT->isReachableFromEntry(BB)) {
1477 // Don't go into unreachable blocks. They may contain instructions with
1478 // dependency cycles which confuse the final scheduling.
1479 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1480 newTreeEntry(VL, false, UserTreeIdx);
1481 return;
1482 }
1483
1484 // Check that every instruction appears once in this bundle.
1485 for (unsigned i = 0, e = VL.size(); i < e; ++i)
1486 for (unsigned j = i + 1; j < e; ++j)
1487 if (VL[i] == VL[j]) {
1488 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1489 newTreeEntry(VL, false, UserTreeIdx);
1490 return;
1491 }
1492
1493 auto &BSRef = BlocksSchedules[BB];
1494 if (!BSRef)
1495 BSRef = llvm::make_unique<BlockScheduling>(BB);
1496
1497 BlockScheduling &BS = *BSRef.get();
1498
1499 if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
1500 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1501 assert((!BS.getScheduleData(VL0) ||
1502 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
1503 "tryScheduleBundle should cancelScheduling on failure");
1504 newTreeEntry(VL, false, UserTreeIdx);
1505 return;
1506 }
1507 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1508
1509 unsigned ShuffleOrOp = S.IsAltShuffle ?
1510 (unsigned) Instruction::ShuffleVector : S.Opcode;
1511 switch (ShuffleOrOp) {
1512 case Instruction::PHI: {
1513 PHINode *PH = cast<PHINode>(VL0);
1514
1515 // Check for terminator values (e.g. invoke).
1516 for (unsigned j = 0; j < VL.size(); ++j)
1517 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1518 TerminatorInst *Term = dyn_cast<TerminatorInst>(
1519 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1520 if (Term) {
1521 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1522 BS.cancelScheduling(VL, VL0);
1523 newTreeEntry(VL, false, UserTreeIdx);
1524 return;
1525 }
1526 }
1527
1528 newTreeEntry(VL, true, UserTreeIdx);
1529 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1530
1531 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1532 ValueList Operands;
1533 // Prepare the operand vector.
1534 for (Value *j : VL)
1535 Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
1536 PH->getIncomingBlock(i)));
1537
1538 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1539 }
1540 return;
1541 }
1542 case Instruction::ExtractValue:
1543 case Instruction::ExtractElement: {
1544 bool Reuse = canReuseExtract(VL, VL0);
1545 if (Reuse) {
1546 DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1547 } else {
1548 BS.cancelScheduling(VL, VL0);
1549 }
1550 newTreeEntry(VL, Reuse, UserTreeIdx);
1551 return;
1552 }
1553 case Instruction::Load: {
1554 // Check that a vectorized load would load the same memory as a scalar
1555 // load. For example, we don't want to vectorize loads that are smaller
1556 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
1557 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
1558 // from such a struct, we read/write packed bits, disagreeing with the
1559 // unvectorized version.
1560 Type *ScalarTy = VL0->getType();
1561
1562 if (DL->getTypeSizeInBits(ScalarTy) !=
1563 DL->getTypeAllocSizeInBits(ScalarTy)) {
1564 BS.cancelScheduling(VL, VL0);
1565 newTreeEntry(VL, false, UserTreeIdx);
1566 DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
1567 return;
1568 }
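// An illustrative example of the size check above (assuming a DataLayout &DL):
// for a packed type such as i2, DL.getTypeSizeInBits(i2) is 2 while
// DL.getTypeAllocSizeInBits(i2) is 8, so the mismatch fires and the bundle is
// gathered; for i8 or i32 the two sizes match and the loads stay candidates.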
1569
1570 // Make sure all loads in the bundle are simple - we can't vectorize
1571 // atomic or volatile loads.
1572 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1573 LoadInst *L = cast<LoadInst>(VL[i]);
1574 if (!L->isSimple()) {
1575 BS.cancelScheduling(VL, VL0);
1576 newTreeEntry(VL, false, UserTreeIdx);
1577 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1578 return;
1579 }
1580 }
1581
1582 // Check if the loads are consecutive, reversed, or neither.
1583 // TODO: What we really want is to sort the loads, but for now, check
1584 // the two likely directions.
1585 bool Consecutive = true;
1586 bool ReverseConsecutive = true;
1587 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1588 if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1589 Consecutive = false;
1590 break;
1591 } else {
1592 ReverseConsecutive = false;
1593 }
1594 }
1595
1596 if (Consecutive) {
1597 ++NumLoadsWantToKeepOrder;
1598 newTreeEntry(VL, true, UserTreeIdx);
1599 DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1600 return;
1601 }
1602
1603 // If none of the load pairs were consecutive when checked in order,
1604 // check the reverse order.
1605 if (ReverseConsecutive)
1606 for (unsigned i = VL.size() - 1; i > 0; --i)
1607 if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
1608 ReverseConsecutive = false;
1609 break;
1610 }
1611
1612 BS.cancelScheduling(VL, VL0);
1613 newTreeEntry(VL, false, UserTreeIdx);
1614
1615 if (ReverseConsecutive) {
1616 ++NumLoadsWantToChangeOrder;
1617 DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
1618 } else {
1619 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1620 }
1621 return;
1622 }
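// An illustrative example of the ordering logic above: loads of a[0], a[1],
// a[2], a[3] in bundle order are Consecutive and become a tree entry; loads of
// a[3], a[2], a[1], a[0] are ReverseConsecutive, counted in
// NumLoadsWantToChangeOrder but still gathered for now; anything else, e.g.
// a[0], a[2], a[1], a[3], is gathered as non-consecutive.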
1623 case Instruction::ZExt:
1624 case Instruction::SExt:
1625 case Instruction::FPToUI:
1626 case Instruction::FPToSI:
1627 case Instruction::FPExt:
1628 case Instruction::PtrToInt:
1629 case Instruction::IntToPtr:
1630 case Instruction::SIToFP:
1631 case Instruction::UIToFP:
1632 case Instruction::Trunc:
1633 case Instruction::FPTrunc:
1634 case Instruction::BitCast: {
1635 Type *SrcTy = VL0->getOperand(0)->getType();
1636 for (unsigned i = 0; i < VL.size(); ++i) {
1637 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1638 if (Ty != SrcTy || !isValidElementType(Ty)) {
1639 BS.cancelScheduling(VL, VL0);
1640 newTreeEntry(VL, false, UserTreeIdx);
1641 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1642 return;
1643 }
1644 }
1645 newTreeEntry(VL, true, UserTreeIdx);
1646 DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1647
1648 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1649 ValueList Operands;
1650 // Prepare the operand vector.
1651 for (Value *j : VL)
1652 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1653
1654 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1655 }
1656 return;
1657 }
1658 case Instruction::ICmp:
1659 case Instruction::FCmp: {
1660 // Check that all of the compares have the same predicate.
1661 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1662 Type *ComparedTy = VL0->getOperand(0)->getType();
1663 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1664 CmpInst *Cmp = cast<CmpInst>(VL[i]);
1665 if (Cmp->getPredicate() != P0 ||
1666 Cmp->getOperand(0)->getType() != ComparedTy) {
1667 BS.cancelScheduling(VL, VL0);
1668 newTreeEntry(VL, false, UserTreeIdx);
1669 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1670 return;
1671 }
1672 }
1673
1674 newTreeEntry(VL, true, UserTreeIdx);
1675 DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1676
1677 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1678 ValueList Operands;
1679 // Prepare the operand vector.
1680 for (Value *j : VL)
1681 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1682
1683 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1684 }
1685 return;
1686 }
1687 case Instruction::Select:
1688 case Instruction::Add:
1689 case Instruction::FAdd:
1690 case Instruction::Sub:
1691 case Instruction::FSub:
1692 case Instruction::Mul:
1693 case Instruction::FMul:
1694 case Instruction::UDiv:
1695 case Instruction::SDiv:
1696 case Instruction::FDiv:
1697 case Instruction::URem:
1698 case Instruction::SRem:
1699 case Instruction::FRem:
1700 case Instruction::Shl:
1701 case Instruction::LShr:
1702 case Instruction::AShr:
1703 case Instruction::And:
1704 case Instruction::Or:
1705 case Instruction::Xor:
1706 newTreeEntry(VL, true, UserTreeIdx);
1707 DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1708
1709 // Sort operands of the instructions so that each side is more likely to
1710 // have the same opcode.
1711 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1712 ValueList Left, Right;
1713 reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
1714 buildTree_rec(Left, Depth + 1, UserTreeIdx);
1715 buildTree_rec(Right, Depth + 1, UserTreeIdx);
1716 return;
1717 }
1718
1719 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1720 ValueList Operands;
1721 // Prepare the operand vector.
1722 for (Value *j : VL)
1723 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1724
1725 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1726 }
1727 return;
1728
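// An illustrative example of the recursion above: for a bundle
// {a0 + b0, a1 + b1}, the commutative-reordering path builds Left = {a0, a1}
// and Right = {b0, b1} (possibly swapping lanes so each side shares an opcode
// or forms a splat) and then recurses on each side as its own bundle.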
1729 case Instruction::GetElementPtr: {
1730 // We don't combine GEPs with complicated (nested) indexing.
1731 for (unsigned j = 0; j < VL.size(); ++j) {
1732 if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1733 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1734 BS.cancelScheduling(VL, VL0);
1735 newTreeEntry(VL, false, UserTreeIdx);
1736 return;
1737 }
1738 }
1739
1740 // We can't combine several GEPs into one vector if they operate on
1741 // different types.
1742 Type *Ty0 = VL0->getOperand(0)->getType();
1743 for (unsigned j = 0; j < VL.size(); ++j) {
1744 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1745 if (Ty0 != CurTy) {
1746 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1747 BS.cancelScheduling(VL, VL0);
1748 newTreeEntry(VL, false, UserTreeIdx);
1749 return;
1750 }
1751 }
1752
1753 // We don't combine GEPs with non-constant indexes.
1754 for (unsigned j = 0; j < VL.size(); ++j) {
1755 auto Op = cast<Instruction>(VL[j])->getOperand(1);
1756 if (!isa<ConstantInt>(Op)) {
1757 DEBUG(
1758 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1759 BS.cancelScheduling(VL, VL0);
1760 newTreeEntry(VL, false, UserTreeIdx);
1761 return;
1762 }
1763 }
1764
1765 newTreeEntry(VL, true, UserTreeIdx);
1766 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1767 for (unsigned i = 0, e = 2; i < e; ++i) {
1768 ValueList Operands;
1769 // Prepare the operand vector.
1770 for (Value *j : VL)
1771 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1772
1773 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1774 }
1775 return;
1776 }
1777 case Instruction::Store: {
1778 // Check if the stores are consecutive or if we need to swizzle them.
1779 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1780 if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1781 BS.cancelScheduling(VL, VL0);
1782 newTreeEntry(VL, false, UserTreeIdx);
1783 DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1784 return;
1785 }
1786
1787 newTreeEntry(VL, true, UserTreeIdx);
1788 DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1789
1790 ValueList Operands;
1791 for (Value *j : VL)
1792 Operands.push_back(cast<Instruction>(j)->getOperand(0));
1793
1794 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1795 return;
1796 }
1797 case Instruction::Call: {
1798 // Check if the calls are all to the same vectorizable intrinsic.
1799 CallInst *CI = cast<CallInst>(VL0);
1800 // Check if this is an Intrinsic call or something that can be
1801 // represented by an intrinsic call.
1802 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1803 if (!isTriviallyVectorizable(ID)) {
1804 BS.cancelScheduling(VL, VL0);
1805 newTreeEntry(VL, false, UserTreeIdx);
1806 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1807 return;
1808 }
1809 Function *Int = CI->getCalledFunction();
1810 Value *A1I = nullptr;
1811 if (hasVectorInstrinsicScalarOpd(ID, 1))
1812 A1I = CI->getArgOperand(1);
1813 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1814 CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1815 if (!CI2 || CI2->getCalledFunction() != Int ||
1816 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
1817 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
1818 BS.cancelScheduling(VL, VL0);
1819 newTreeEntry(VL, false, UserTreeIdx);
1820 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1821 << "\n");
1822 return;
1823 }
1824 // ctlz, cttz and powi are special intrinsics whose second argument
1825 // should be the same in order for them to be vectorized.
1826 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1827 Value *A1J = CI2->getArgOperand(1);
1828 if (A1I != A1J) {
1829 BS.cancelScheduling(VL, VL0);
1830 newTreeEntry(VL, false, UserTreeIdx);
1831 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1832 << " argument " << A1I << "!=" << A1J
1833 << "\n");
1834 return;
1835 }
1836 }
1837 // Verify that the bundle operands are identical between the two calls.
1838 if (CI->hasOperandBundles() &&
1839 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
1840 CI->op_begin() + CI->getBundleOperandsEndIndex(),
1841 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
1842 BS.cancelScheduling(VL, VL0);
1843 newTreeEntry(VL, false, UserTreeIdx);
1844 DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
1845 << *VL[i] << '\n');
1846 return;
1847 }
1848 }
1849
1850 newTreeEntry(VL, true, UserTreeIdx);
1851 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1852 ValueList Operands;
1853 // Prepare the operand vector.
1854 for (Value *j : VL) {
1855 CallInst *CI2 = cast<CallInst>(j);
1856 Operands.push_back(CI2->getArgOperand(i));
1857 }
1858 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1859 }
1860 return;
1861 }
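// An illustrative example of the scalar-operand rule above: llvm.powi calls
// can only be bundled when every call uses the same exponent, so
// {powi(x, 3), powi(y, 3)} is accepted while {powi(x, 3), powi(y, 4)} is
// rejected as mismatched arguments and gathered instead.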
1862 case Instruction::ShuffleVector:
1863 // If this is not an alternate sequence of opcodes like add-sub
1864 // then do not vectorize this instruction.
1865 if (!S.IsAltShuffle) {
1866 BS.cancelScheduling(VL, VL0);
1867 newTreeEntry(VL, false, UserTreeIdx);
1868 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1869 return;
1870 }
1871 newTreeEntry(VL, true, UserTreeIdx);
1872 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1873
1874 // Reorder operands if reordering would enable vectorization.
1875 if (isa<BinaryOperator>(VL0)) {
1876 ValueList Left, Right;
1877 reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
1878 buildTree_rec(Left, Depth + 1, UserTreeIdx);
1879 buildTree_rec(Right, Depth + 1, UserTreeIdx);
1880 return;
1881 }
1882
1883 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1884 ValueList Operands;
1885 // Prepare the operand vector.
1886 for (Value *j : VL)
1887 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1888
1889 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1890 }
1891 return;
1892
1893 default:
1894 BS.cancelScheduling(VL, VL0);
1895 newTreeEntry(VL, false, UserTreeIdx);
1896 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1897 return;
1898 }
1899}
1900
1901unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
1902 unsigned N;
1903 Type *EltTy;
1904 auto *ST = dyn_cast<StructType>(T);
1905 if (ST) {
1906 N = ST->getNumElements();
1907 EltTy = *ST->element_begin();
1908 } else {
1909 N = cast<ArrayType>(T)->getNumElements();
1910 EltTy = cast<ArrayType>(T)->getElementType();
1911 }
1912 if (!isValidElementType(EltTy))
1913 return 0;
1914 uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
1915 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
1916 return 0;
1917 if (ST) {
1918 // Check that struct is homogeneous.
1919 for (const auto *Ty : ST->elements())
1920 if (Ty != EltTy)
1921 return 0;
1922 }
1923 return N;
1924}
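// A minimal, free-standing sketch of the homogeneity rule used by
// canMapToVector above (illustrative only; the helper name is hypothetical and
// not part of the pass). A struct such as {float, float, float, float} passes
// and could map to <4 x float>; a mixed struct like {float, i32} fails. Like
// the code above, it assumes the struct has at least one element.
static bool isHomogeneousStructSketch(StructType *ST) {
  Type *EltTy = *ST->element_begin();
  for (Type *Ty : ST->elements())
    if (Ty != EltTy)
      return false; // Mixed element types cannot map to a single vector type.
  return true;
}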
1925
1926bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
1927 Instruction *E0 = cast<Instruction>(OpValue);
1928 assert(E0->getOpcode() == Instruction::ExtractElement ||
1929 E0->getOpcode() == Instruction::ExtractValue);
1930 assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
1931 // Check if all of the extracts come from the same vector and from the
1932 // correct offset.
1933 Value *Vec = E0->getOperand(0);
1934
1935 // We have to extract from a vector/aggregate with the same number of elements.
1936 unsigned NElts;
1937 if (E0->getOpcode() == Instruction::ExtractValue) {
1938 const DataLayout &DL = E0->getModule()->getDataLayout();
1939 NElts = canMapToVector(Vec->getType(), DL);
1940 if (!NElts)
1941 return false;
1942 // Check if load can be rewritten as load of vector.
1943 LoadInst *LI = dyn_cast<LoadInst>(Vec);
1944 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
1945 return false;
1946 } else {
1947 NElts = Vec->getType()->getVectorNumElements();
1948 }
1949
1950 if (NElts != VL.size())
1951 return false;
1952
1953 // Check that all of the indices extract from the correct offset.
1954 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
1955 Instruction *Inst = cast<Instruction>(VL[I]);
1956 if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
1957 return false;
1958 if (Inst->getOperand(0) != Vec)
1959 return false;
1960 }
1961
1962 return true;
1963}
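// An illustrative example of the reuse check above: a bundle of
// extractelement %vec, 0 / 1 / 2 / 3 that all read the same <4 x float> %vec at
// lane indices matching their bundle positions can simply reuse %vec, whereas a
// bundle mixing two source vectors, or extracting lanes out of order, cannot.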
1964
1965bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
1966 return I->hasOneUse() ||
1967 std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
1968 return ScalarToTreeEntry.count(U) > 0;
1969 });
1970}
1971
1972int BoUpSLP::getEntryCost(TreeEntry *E) {
1973 ArrayRef<Value*> VL = E->Scalars;
1974
1975 Type *ScalarTy = VL[0]->getType();
1976 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1977 ScalarTy = SI->getValueOperand()->getType();
1978 else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
1979 ScalarTy = CI->getOperand(0)->getType();
1980 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1981
1982 // If we have computed a smaller type for the expression, update VecTy so
1983 // that the costs will be accurate.
1984 if (MinBWs.count(VL[0]))
1985 VecTy = VectorType::get(
1986 IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
1987
1988 if (E->NeedToGather) {
1989 if (allConstant(VL))
1990 return 0;
1991 if (isSplat(VL)) {
1992 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1993 }
1994 if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
1995 Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
1996 if (ShuffleKind.hasValue()) {
1997 int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
1998 for (auto *V : VL) {
1999 // If all users of instruction are going to be vectorized and this
2000 // instruction itself is not going to be vectorized, consider this
2001 // instruction as dead and remove its cost from the final cost of the
2002 // vectorized tree.
2003 if (areAllUsersVectorized(cast<Instruction>(V)) &&
2004 !ScalarToTreeEntry.count(V)) {
2005 auto *IO = cast<ConstantInt>(
2006 cast<ExtractElementInst>(V)->getIndexOperand());
2007 Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
2008 IO->getZExtValue());
2009 }
2010 }
2011 return Cost;
2012 }
2013 }
2014 return getGatherCost(E->Scalars);
2015 }
2016 InstructionsState S = getSameOpcode(VL);
2017 assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
2018 Instruction *VL0 = cast<Instruction>(S.OpValue);
2019 unsigned ShuffleOrOp = S.IsAltShuffle ?
2020 (unsigned) Instruction::ShuffleVector : S.Opcode;
2021 switch (ShuffleOrOp) {
2022 case Instruction::PHI:
2023 return 0;
2024
2025 case Instruction::ExtractValue:
2026 case Instruction::ExtractElement:
2027 if (canReuseExtract(VL, S.OpValue)) {
2028 int DeadCost = 0;
2029 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
2030 Instruction *E = cast<Instruction>(VL[i]);
2031 // If all users are going to be vectorized, the instruction can be
2032 // considered dead.
2033 // Likewise, if it has only one user, it will be vectorized for sure.
2034 if (areAllUsersVectorized(E))
2035 // Take credit for instruction that will become dead.
2036 DeadCost +=
2037 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
2038 }
2039 return -DeadCost;
2040 }
2041 return getGatherCost(VecTy);
2042
2043 case Instruction::ZExt:
2044 case Instruction::SExt:
2045 case Instruction::FPToUI:
2046 case Instruction::FPToSI:
2047 case Instruction::FPExt:
2048 case Instruction::PtrToInt:
2049 case Instruction::IntToPtr:
2050 case Instruction::SIToFP:
2051 case Instruction::UIToFP:
2052 case Instruction::Trunc:
2053 case Instruction::FPTrunc:
2054 case Instruction::BitCast: {
2055 Type *SrcTy = VL0->getOperand(0)->getType();
2056
2057 // Calculate the cost of this instruction.
2058 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
2059 VL0->getType(), SrcTy, VL0);
2060
2061 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
2062 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
2063 return VecCost - ScalarCost;
2064 }
2065 case Instruction::FCmp:
2066 case Instruction::ICmp:
2067 case Instruction::Select: {
2068 // Calculate the cost of this instruction.
2069 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
2070 int ScalarCost = VecTy->getNumElements() *
2071 TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
2072 int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
2073 return VecCost - ScalarCost;
2074 }
2075 case Instruction::Add:
2076 case Instruction::FAdd:
2077 case Instruction::Sub:
2078 case Instruction::FSub:
2079 case Instruction::Mul:
2080 case Instruction::FMul:
2081 case Instruction::UDiv:
2082 case Instruction::SDiv:
2083 case Instruction::FDiv:
2084 case Instruction::URem:
2085 case Instruction::SRem:
2086 case Instruction::FRem:
2087 case Instruction::Shl:
2088 case Instruction::LShr:
2089 case Instruction::AShr:
2090 case Instruction::And:
2091 case Instruction::Or:
2092 case Instruction::Xor: {
2093 // Certain instructions can be cheaper to vectorize if they have a
2094 // constant second vector operand.
2095 TargetTransformInfo::OperandValueKind Op1VK =
2096 TargetTransformInfo::OK_AnyValue;
2097 TargetTransformInfo::OperandValueKind Op2VK =
2098 TargetTransformInfo::OK_UniformConstantValue;
2099 TargetTransformInfo::OperandValueProperties Op1VP =
2100 TargetTransformInfo::OP_None;
2101 TargetTransformInfo::OperandValueProperties Op2VP =
2102 TargetTransformInfo::OP_None;
2103
2104 // If all operands are exactly the same ConstantInt then set the
2105 // operand kind to OK_UniformConstantValue.
2106 // If instead not all operands are constants, then set the operand kind
2107 // to OK_AnyValue. If all operands are constants but not the same,
2108 // then set the operand kind to OK_NonUniformConstantValue.
2109 ConstantInt *CInt = nullptr;
2110 for (unsigned i = 0; i < VL.size(); ++i) {
2111 const Instruction *I = cast<Instruction>(VL[i]);
2112 if (!isa<ConstantInt>(I->getOperand(1))) {
2113 Op2VK = TargetTransformInfo::OK_AnyValue;
2114 break;
2115 }
2116 if (i == 0) {
2117 CInt = cast<ConstantInt>(I->getOperand(1));
2118 continue;
2119 }
2120 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
2121 CInt != cast<ConstantInt>(I->getOperand(1)))
2122 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
2123 }
2124 // FIXME: Currently cost of model modification for division by power of
2125 // 2 is handled for X86 and AArch64. Add support for other targets.
2126 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
2127 CInt->getValue().isPowerOf2())
2128 Op2VP = TargetTransformInfo::OP_PowerOf2;
2129
2130 SmallVector<const Value *, 4> Operands(VL0->operand_values());
2131 int ScalarCost =
2132 VecTy->getNumElements() *
2133 TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
2134 Op2VP, Operands);
2135 int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
2136 Op1VP, Op2VP, Operands);
2137 return VecCost - ScalarCost;
2138 }
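// Illustrative examples of the operand-kind classification above:
// {x / 4, y / 4} has the same ConstantInt 4 in every lane, so Op2VK stays
// OK_UniformConstantValue and, because 4 is a power of two, Op2VP becomes
// OP_PowerOf2; {x / 4, y / 8} downgrades to OK_NonUniformConstantValue; and
// {x / 4, y / n} with a non-constant n falls back to OK_AnyValue.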
2139 case Instruction::GetElementPtr: {
2140 TargetTransformInfo::OperandValueKind Op1VK =
2141 TargetTransformInfo::OK_AnyValue;
2142 TargetTransformInfo::OperandValueKind Op2VK =
2143 TargetTransformInfo::OK_UniformConstantValue;
2144
2145 int ScalarCost =
2146 VecTy->getNumElements() *
2147 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
2148 int VecCost =
2149 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
2150
2151 return VecCost - ScalarCost;
2152 }
2153 case Instruction::Load: {
2154 // Cost of wide load - cost of scalar loads.
2155 unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
2156 int ScalarLdCost = VecTy->getNumElements() *
2157 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
2158 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
2159 VecTy, alignment, 0, VL0);
2160 return VecLdCost - ScalarLdCost;
2161 }
2162 case Instruction::Store: {
2163 // We know that we can merge the stores. Calculate the cost.
2164 unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
2165 int ScalarStCost = VecTy->getNumElements() *
2166 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
2167 int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
2168 VecTy, alignment, 0, VL0);
2169 return VecStCost - ScalarStCost;
2170 }
2171 case Instruction::Call: {
2172 CallInst *CI = cast<CallInst>(VL0);
2173 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2174
2175 // Calculate the cost of the scalar and vector calls.
2176 SmallVector<Type*, 4> ScalarTys;
2177 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
2178 ScalarTys.push_back(CI->getArgOperand(op)->getType());
2179
2180 FastMathFlags FMF;
2181 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2182 FMF = FPMO->getFastMathFlags();
2183
2184 int ScalarCallCost = VecTy->getNumElements() *
2185 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
2186
2187 SmallVector<Value *, 4> Args(CI->arg_operands());
2188 int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
2189 VecTy->getNumElements());
2190
2191 DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
2192 << " (" << VecCallCost << "-" << ScalarCallCost << ")"
2193 << " for " << *CI << "\n");
2194
2195 return VecCallCost - ScalarCallCost;
2196 }
2197 case Instruction::ShuffleVector: {
2198 TargetTransformInfo::OperandValueKind Op1VK =
2199 TargetTransformInfo::OK_AnyValue;
2200 TargetTransformInfo::OperandValueKind Op2VK =
2201 TargetTransformInfo::OK_AnyValue;
2202 int ScalarCost = 0;
2203 int VecCost = 0;
2204 for (Value *i : VL) {
2205 Instruction *I = dyn_cast<Instruction>(i);
2206 if (!I)
2207 break;
2208 ScalarCost +=
2209 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
2210 }
2211 // VecCost is equal to sum of the cost of creating 2 vectors
2212 // and the cost of creating shuffle.
2213 Instruction *I0 = cast<Instruction>(VL[0]);
2214 VecCost =
2215 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
2216 Instruction *I1 = cast<Instruction>(VL[1]);
2217 VecCost +=
2218 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
2219 VecCost +=
2220 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
2221 return VecCost - ScalarCost;
2222 }
2223 default:
2224 llvm_unreachable("Unknown instruction");
2225 }
2226}
2227
2228bool BoUpSLP::isFullyVectorizableTinyTree() {
2229 DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
2230 VectorizableTree.size() << " is fully vectorizable .\n");
2231
2232 // We only handle trees of heights 1 and 2.
2233 if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
2234 return true;
2235
2236 if (VectorizableTree.size() != 2)
2237 return false;
2238
2239 // Handle splat and all-constant stores.
2240 if (!VectorizableTree[0].NeedToGather &&
2241 (allConstant(VectorizableTree[1].Scalars) ||
2242 isSplat(VectorizableTree[1].Scalars)))
2243 return true;
2244
2245 // Gathering cost would be too much for tiny trees.
2246 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
2247 return false;
2248
2249 return true;
2250}
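// An illustrative example of the tiny-tree rule above: a two-entry tree whose
// first entry is a vectorizable store bundle and whose second entry is the
// all-constant (or splat) value bundle feeding it is considered fully
// vectorizable; otherwise, if either of the two entries must be gathered, the
// tree is rejected.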
2251
2252bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
2253 // We can vectorize the tree if its size is greater than or equal to the
2254 // minimum size specified by the MinTreeSize command line option.
2255 if (VectorizableTree.size() >= MinTreeSize)
2256 return false;
2257
2258 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
2259 // can vectorize it if we can prove it fully vectorizable.
2260 if (isFullyVectorizableTinyTree())
2261 return false;
2262
2263 assert(VectorizableTree.empty()
2264 ? ExternalUses.empty()
2265 : true && "We shouldn't have any external users");
2266
2267 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
2268 // vectorizable.
2269 return true;
2270}
2271
2272int BoUpSLP::getSpillCost() {
2273 // Walk from the bottom of the tree to the top, tracking which values are
2274 // live. When we see a call instruction that is not part of our tree,
2275 // query TTI to see if there is a cost to keeping values live over it
2276 // (for example, if spills and fills are required).
2277 unsigned BundleWidth = VectorizableTree.front().Scalars.size();
2278 int Cost = 0;
2279
2280 SmallPtrSet<Instruction*, 4> LiveValues;
2281 Instruction *PrevInst = nullptr;
2282
2283 for (const auto &N : VectorizableTree) {
2284 Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
2285 if (!Inst)
2286 continue;
2287
2288 if (!PrevInst) {
2289 PrevInst = Inst;
2290 continue;
2291 }
2292
2293 // Update LiveValues.
2294 LiveValues.erase(PrevInst);
2295 for (auto &J : PrevInst->operands()) {
2296 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
2297 LiveValues.insert(cast<Instruction>(&*J));
2298 }
2299
2300 DEBUG(
2301 dbgs() << "SLP: #LV: " << LiveValues.size();
2302 for (auto *X : LiveValues)
2303 dbgs() << " " << X->getName();
2304 dbgs() << ", Looking at ";
2305 Inst->dump();
2306 );
2307
2308 // Now find the sequence of instructions between PrevInst and Inst.
2309 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
2310 PrevInstIt =
2311 PrevInst->getIterator().getReverse();
2312 while (InstIt != PrevInstIt) {
2313 if (PrevInstIt == PrevInst->getParent()->rend()) {
2314 PrevInstIt = Inst->getParent()->rbegin();
2315 continue;
2316 }
2317
2318 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
2319 SmallVector<Type*, 4> V;
2320 for (auto *II : LiveValues)
2321 V.push_back(VectorType::get(II->getType(), BundleWidth));
2322 Cost += TTI->getCostOfKeepingLiveOverCall(V);
2323 }
2324
2325 ++PrevInstIt;
2326 }
2327
2328 PrevInst = Inst;
2329 }
2330
2331 return Cost;
2332}
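// An illustrative example of the spill accounting above: if a call to some
// external function sits between two bundled tree instructions while two tree
// values are still live, the loop charges TTI->getCostOfKeepingLiveOverCall()
// for two BundleWidth-wide vector types, modeling the spills and fills that may
// be needed around the call.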
2333
2334int BoUpSLP::getTreeCost() {
2335 int Cost = 0;
2336 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
2337 VectorizableTree.size() << ".\n");
2338
2339 unsigned BundleWidth = VectorizableTree[0].Scalars.size();
2340
2341 for (TreeEntry &TE : VectorizableTree) {
2342 int C = getEntryCost(&TE);
2343 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
2344 << *TE.Scalars[0] << ".\n");
2345 Cost += C;
2346 }
2347
2348 SmallSet<Value *, 16> ExtractCostCalculated;
2349 int ExtractCost = 0;
2350 for (ExternalUser &EU : ExternalUses) {
2351 // We only add extract cost once for the same scalar.
2352 if (!ExtractCostCalculated.insert(EU.Scalar).second)
2353 continue;
2354
2355 // Uses by ephemeral values are free (because the ephemeral value will be
2356 // removed prior to code generation, and so the extraction will be
2357 // removed as well).
2358 if (EphValues.count(EU.User))
2359 continue;
2360
2361 // If we plan to rewrite the tree in a smaller type, we will need to sign
2362 // extend the extracted value back to the original type. Here, we account
2363 // for the extract and the added cost of the sign extend if needed.
2364 auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
2365 auto *ScalarRoot = VectorizableTree[0].Scalars[0];
2366 if (MinBWs.count(ScalarRoot)) {
2367 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
2368 auto Extend =
2369 MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
2370 VecTy = VectorType::get(MinTy, BundleWidth);
2371 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
2372 VecTy, EU.Lane);
2373 } else {
2374 ExtractCost +=
2375 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
2376 }
2377 }
2378
2379 int SpillCost = getSpillCost();
2380 Cost += SpillCost + ExtractCost;
2381
2382 std::string Str;
2383 {
2384 raw_string_ostream OS(Str);
2385 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
2386 << "SLP: Extract Cost = " << ExtractCost << ".\n"
2387 << "SLP: Total Cost = " << Cost << ".\n";
2388 }
2389 DEBUG(dbgs() << Str);
2390
2391 if (ViewSLPTree)
2392 ViewGraph(this, "SLP" + F->getName(), false, Str);
2393
2394 return Cost;
2395}
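// Informally, the total computed above is
//   Cost = sum over tree entries of getEntryCost(Entry)
//        + ExtractCost (one extract, plus a possible extend, per externally
//          used scalar)
//        + SpillCost,
// where each entry cost is already a vector-minus-scalar difference, so a
// negative total means the vectorized form is modeled as cheaper.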
2396
2397int BoUpSLP::getGatherCost(Type *Ty) {
2398 int Cost = 0;
2399 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
2400 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
2401 return Cost;
2402}
2403
2404int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
2405 // Find the type of the operands in VL.
2406 Type *ScalarTy = VL[0]->getType();
2407 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2408 ScalarTy = SI->getValueOperand()->getType();
2409 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2410 // Find the cost of inserting/extracting values from the vector.
2411 return getGatherCost(VecTy);
2412}
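// An illustrative example of the gather cost above: for a four-lane bundle of
// i32 values that must be gathered, the cost is the sum of the four
// InsertElement costs into a <4 x i32> vector, one per lane index.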
2413
2414// Reorder commutative operations in alternate shuffle if the resulting vectors
2415// are consecutive loads. This would allow us to vectorize the tree.
2416 // If we have something like:
2417// load a[0] - load b[0]
2418// load b[1] + load a[1]
2419// load a[2] - load b[2]
2420// load a[3] + load b[3]
2421 // Reordering the operands of the second lane (load b[1], load a[1]) would
2422 // allow us to vectorize this code.
2423void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
2424 SmallVectorImpl<Value *> &Left,
2425 SmallVectorImpl<Value *> &Right) {
2426 // Push left and right operands of binary operation into Left and Right
2427 unsigned AltOpcode = getAltOpcode(Opcode);
2428 (void)AltOpcode;
2429 for (Value *V : VL) {
2430 auto *I = cast<Instruction>(V);
2431 assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
2432 "Incorrect instruction in vector");
2433 Left.push_back(I->getOperand(0));
2434 Right.push_back(I->getOperand(1));
2435 }
2436
2437 // Reorder if we have a commutative operation and consecutive accesses
2438 // are on either side of the alternate instructions.
2439 for (unsigned j = 0; j < VL.size() - 1; ++j) {
2440 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2441 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2442 Instruction *VL1 = cast<Instruction>(VL[j]);
2443 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2444 if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2445 std::swap(Left[j], Right[j]);
2446 continue;
2447 } else if (VL2->isCommutative() &&
2448 isConsecutiveAccess(L, L1, *DL, *SE)) {
2449 std::swap(Left[j + 1], Right[j + 1]);
2450 continue;
2451 }
2452 // else unchanged
2453 }
2454 }
2455 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2456 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2457 Instruction *VL1 = cast<Instruction>(VL[j]);
2458 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2459 if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2460 std::swap(Left[j], Right[j]);
2461 continue;
2462 } else if (VL2->isCommutative() &&
2463 isConsecutiveAccess(L, L1, *DL, *SE)) {
2464 std::swap(Left[j + 1], Right[j + 1]);
2465 continue;
2466 }
2467 // else unchanged
2468 }
2469 }
2470 }
2471}
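// An illustrative example of the reordering above: for the add/sub pattern in
// the comment before this function, the initial split is
// Left = {a[0], b[1], a[2], a[3]} and Right = {b[0], a[1], b[2], b[3]};
// swapping lane 1 yields Left = {a[0], a[1], a[2], a[3]} and
// Right = {b[0], b[1], b[2], b[3]}, two bundles of consecutive loads.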
2472
2473 // Return true if I should be commuted before adding its left and right
2474// operands to the arrays Left and Right.
2475//
2476 // The vectorizer is trying either to have all elements on one side be
2477 // instructions with the same opcode, to enable further vectorization, or to
2478 // have a splat to lower the vectorization cost.
2479static bool shouldReorderOperands(
2480 int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
2481 ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
2482 bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
2483 VLeft = I.getOperand(0);
2484 VRight = I.getOperand(1);
2485 // If we have "SplatRight", try to see if commuting is needed to preserve it.
2486 if (SplatRight) {
2487 if (VRight == Right[i - 1])
2488 // Preserve SplatRight
2489 return false;
2490 if (VLeft == Right[i - 1]) {
2491 // Commuting would preserve SplatRight, but we don't want to break
2492 // SplatLeft either, i.e. preserve the original order if possible.
2493 // (FIXME: why do we care?)
2494 if (SplatLeft && VLeft == Left[i - 1])
2495 return false;
2496 return true;
2497 }
2498 }
2499 // Symmetrically handle Right side.
2500 if (SplatLeft) {
2501 if (VLeft == Left[i - 1])
2502 // Preserve SplatLeft
2503 return false;
2504 if (VRight == Left[i - 1])
2505 return true;
2506 }
2507
2508 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
2509 Instruction *IRight = dyn_cast<Instruction>(VRight);
2510
2511 // If we have "AllSameOpcodeRight", try to see if the left operands preserves
2512 // it and not the right, in this case we want to commute.
2513 if (AllSameOpcodeRight) {
2514 unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
2515 if (IRight && RightPrevOpcode == IRight->getOpcode())
2516 // Do not commute, a match on the right preserves AllSameOpcodeRight
2517 return false;
2518 if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
2519 // We have a match and may want to commute, but first check if there is
2520 // not also a match on the existing operands on the Left to preserve
2521 // AllSameOpcodeLeft, i.e. preserve the original order if possible.
2522 // (FIXME: why do we care?)
2523 if (AllSameOpcodeLeft && ILeft &&
2524 cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
2525 return false;
2526 return true;
2527 }
2528 }
2529 // Symmetrically handle Left side.
2530 if (AllSameOpcodeLeft) {
2531 unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
2532 if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
2533 return false;
2534 if (IRight && LeftPrevOpcode == IRight->getOpcode())
2535 return true;
2536 }
2537 return false;
2538}
2539
2540void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
2541 ArrayRef<Value *> VL,
2542 SmallVectorImpl<Value *> &Left,
2543 SmallVectorImpl<Value *> &Right) {
2544 if (!VL.empty()) {
2545 // Peel the first iteration out of the loop since there's nothing
2546 // interesting to do anyway and it simplifies the checks in the loop.
2547 auto *I = cast<Instruction>(VL[0]);
2548 Value *VLeft = I->getOperand(0);
2549 Value *VRight = I->getOperand(1);
2550 if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
2551 // Favor having instruction to the right. FIXME: why?
2552 std::swap(VLeft, VRight);
2553 Left.push_back(VLeft);
2554 Right.push_back(VRight);
2555 }
2556
2557 // Keep track if we have instructions with all the same opcode on one side.
2558 bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
2559 bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
2560 // Keep track if we have one side with all the same value (broadcast).
2561 bool SplatLeft = true;
2562 bool SplatRight = true;
2563
2564 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
2565 Instruction *I = cast<Instruction>(VL[i]);
2566 assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
2567 (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
2568 "Can only process commutative instruction");
2569 // Commute to favor either a splat or maximizing having the same opcodes on
2570 // one side.
2571 Value *VLeft;
2572 Value *VRight;
2573 if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
2574 AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
2575 VRight)) {
2576 Left.push_back(VRight);
2577 Right.push_back(VLeft);
2578 } else {
2579 Left.push_back(VLeft);
2580 Right.push_back(VRight);
2581 }
2582 // Update Splat* and AllSameOpcode* after the insertion.
2583 SplatRight = SplatRight && (Right[i - 1] == Right[i]);
2584 SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
2585 AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
2586 (cast<Instruction>(Left[i - 1])->getOpcode() ==
2587 cast<Instruction>(Left[i])->getOpcode());
2588 AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
2589 (cast<Instruction>(Right[i - 1])->getOpcode() ==
2590 cast<Instruction>(Right[i])->getOpcode());
2591 }
2592
2593 // If one operand ends up being a broadcast, return this operand order.
2594 if (SplatRight || SplatLeft)
2595 return;
2596
2597 // Finally check if we can get longer vectorizable chain by reordering
2598 // without breaking the good operand order detected above.
2599 // E.g. If we have something like-
2600 // load a[0] load b[0]
2601 // load b[1] load a[1]
2602 // load a[2] load b[2]
2603 // load a[3] load b[3]
2604 // Reordering the second pair "load b[1], load a[1]" would allow us to
2605 // vectorize this code while still retaining the AllSameOpcode property.
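// After swapping the second element, the example above becomes
// Left = { a[0], a[1], a[2], a[3] } and Right = { b[0], b[1], b[2], b[3] },
// so each side forms one chain of consecutive loads.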
2606 // FIXME: This load reordering might break AllSameOpcode in some rare cases
2607 // such as-
2608 // add a[0],c[0] load b[0]
2609 // add a[1],c[2] load b[1]
2610 // b[2] load b[2]
2611 // add a[3],c[3] load b[3]
2612 for (unsigned j = 0; j < VL.size() - 1; ++j) {
2613 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2614 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2615 if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2616 std::swap(Left[j + 1], Right[j + 1]);
2617 continue;
2618 }
2619 }
2620 }
2621 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2622 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2623 if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2624 std::swap(Left[j + 1], Right[j + 1]);
2625 continue;
2626 }
2627 }
2628 }
2629 // else unchanged
2630 }
2631}
2632
2633void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
2634 // Get the basic block this bundle is in. All instructions in the bundle
2635 // should be in this block.
2636 auto *Front = cast<Instruction>(OpValue);
2637 auto *BB = Front->getParent();
2638 const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
2639 const unsigned AltOpcode = getAltOpcode(Opcode);
2640 assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
2641 return !sameOpcodeOrAlt(Opcode, AltOpcode,
2642 cast<Instruction>(V)->getOpcode()) ||
2643 cast<Instruction>(V)->getParent() == BB;
2644 }));
2645
2646 // The last instruction in the bundle in program order.
2647 Instruction *LastInst = nullptr;
2648
2649 // Find the last instruction. The common case should be that BB has been
2650 // scheduled, and the last instruction is VL.back(). So we start with
2651 // VL.back() and iterate over schedule data until we reach the end of the
2652 // bundle. The end of the bundle is marked by null ScheduleData.
2653 if (BlocksSchedules.count(BB)) {
2654 auto *Bundle =
2655 BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
2656 if (Bundle && Bundle->isPartOfBundle())
2657 for (; Bundle; Bundle = Bundle->NextInBundle)
2658 if (Bundle->OpValue == Bundle->Inst)
2659 LastInst = Bundle->Inst;
2660 }
2661
2662 // LastInst can still be null at this point if there's either not an entry
2663 // for BB in BlocksSchedules or there's no ScheduleData available for
2664 // VL.back(). This can be the case if buildTree_rec aborts for various
2665 // reasons (e.g., the maximum recursion depth is reached, the maximum region
2666 // size is reached, etc.). ScheduleData is initialized in the scheduling
2667 // "dry-run".
2668 //
2669 // If this happens, we can still find the last instruction by brute force. We
2670 // iterate forwards from Front (inclusive) until we either see all
2671 // instructions in the bundle or reach the end of the block. If Front is the
2672 // last instruction in program order, LastInst will be set to Front, and we
2673 // will visit all the remaining instructions in the block.
2674 //
2675 // One of the reasons we exit early from buildTree_rec is to place an upper
2676 // bound on compile-time. Thus, taking an additional compile-time hit here is
2677 // not ideal. However, this should be exceedingly rare since it requires that
2678 // we both exit early from buildTree_rec and that the bundle be out-of-order
2679 // (causing us to iterate all the way to the end of the block).
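// For instance, if Front is the first member of a four-wide bundle whose
// remaining members are scattered further down BB, the scan below erases each
// member as it is reached and stops once the set is empty; the last member
// reached with a matching opcode becomes LastInst.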
2680 if (!LastInst) {
2681 SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
2682 for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
2683 if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
2684 LastInst = &I;
2685 if (Bundle.empty())
2686 break;
2687 }
2688 }
2689
2690 // Set the insertion point after the last instruction in the bundle. Set the
2691 // debug location to Front.
2692 Builder.SetInsertPoint(BB, ++LastInst->getIterator());
2693 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
2694}
2695
2696Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2697 Value *Vec = UndefValue::get(Ty);
2698 // Generate the 'InsertElement' instruction.
2699 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2700 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2701 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2702 GatherSeq.insert(Insrt);
2703 CSEBlocks.insert(Insrt->getParent());
2704
2705 // Add to our 'need-to-extract' list.
2706 if (TreeEntry *E = getTreeEntry(VL[i])) {
2707 // Find which lane we need to extract.
2708 int FoundLane = -1;
2709 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2710 // Is this the lane of the scalar that we are looking for ?
2711 if (E->Scalars[Lane] == VL[i]) {
2712 FoundLane = Lane;
2713 break;
2714 }
2715 }
2716 assert(FoundLane >= 0 && "Could not find the correct lane");
2717 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2718 }
2719 }
2720 }
2721
2722 return Vec;
2723}
2724
2725Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
2726 if (const TreeEntry *En = getTreeEntry(OpValue)) {
2727 if (En->isSame(VL) && En->VectorizedValue)
2728 return En->VectorizedValue;
2729 }
2730 return nullptr;
2731}
2732
2733Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2734 InstructionsState S = getSameOpcode(VL);
2735 if (S.Opcode) {
2736 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
2737 if (E->isSame(VL))
2738 return vectorizeTree(E);
2739 }
2740 }
2741
2742 Type *ScalarTy = S.OpValue->getType();
2743 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
2744 ScalarTy = SI->getValueOperand()->getType();
2745 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2746
2747 return Gather(VL, VecTy);
2748}
2749
2750Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2751 IRBuilder<>::InsertPointGuard Guard(Builder);
2752
2753 if (E->VectorizedValue) {
2754 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2755 return E->VectorizedValue;
2756 }
2757
2758 InstructionsState S = getSameOpcode(E->Scalars);
2759 Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2760 Type *ScalarTy = VL0->getType();
2761 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2762 ScalarTy = SI->getValueOperand()->getType();
2763 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2764
2765 if (E->NeedToGather) {
2766 setInsertPointAfterBundle(E->Scalars, VL0);
2767 auto *V = Gather(E->Scalars, VecTy);
2768 E->VectorizedValue = V;
2769 return V;
2770 }
2771
2772 unsigned ShuffleOrOp = S.IsAltShuffle ?
2773 (unsigned) Instruction::ShuffleVector : S.Opcode;
2774 switch (ShuffleOrOp) {
2775 case Instruction::PHI: {
2776 PHINode *PH = dyn_cast<PHINode>(VL0);
2777 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2778 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2779 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2780 E->VectorizedValue = NewPhi;
2781
2782 // PHINodes may have multiple entries from the same block. We want to
2783 // visit every block once.
2784 SmallSet<BasicBlock*, 4> VisitedBBs;
2785
2786 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2787 ValueList Operands;
2788 BasicBlock *IBB = PH->getIncomingBlock(i);
2789
2790 if (!VisitedBBs.insert(IBB).second) {
2791 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2792 continue;
2793 }
2794
2795 // Prepare the operand vector.
2796 for (Value *V : E->Scalars)
2797 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2798
2799 Builder.SetInsertPoint(IBB->getTerminator());
2800 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2801 Value *Vec = vectorizeTree(Operands);
2802 NewPhi->addIncoming(Vec, IBB);
2803 }
2804
2805 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2806 "Invalid number of incoming values");
2807 return NewPhi;
2808 }
2809
2810 case Instruction::ExtractElement: {
2811 if (canReuseExtract(E->Scalars, VL0)) {
2812 Value *V = VL0->getOperand(0);
2813 E->VectorizedValue = V;
2814 return V;
2815 }
2816 setInsertPointAfterBundle(E->Scalars, VL0);
2817 auto *V = Gather(E->Scalars, VecTy);
2818 E->VectorizedValue = V;
2819 return V;
2820 }
2821 case Instruction::ExtractValue: {
2822 if (canReuseExtract(E->Scalars, VL0)) {
2823 LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
2824 Builder.SetInsertPoint(LI);
2825 PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
2826 Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
2827 LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
2828 E->VectorizedValue = V;
2829 return propagateMetadata(V, E->Scalars);
2830 }
2831 setInsertPointAfterBundle(E->Scalars, VL0);
2832 auto *V = Gather(E->Scalars, VecTy);
2833 E->VectorizedValue = V;
2834 return V;
2835 }
2836 case Instruction::ZExt:
2837 case Instruction::SExt:
2838 case Instruction::FPToUI:
2839 case Instruction::FPToSI:
2840 case Instruction::FPExt:
2841 case Instruction::PtrToInt:
2842 case Instruction::IntToPtr:
2843 case Instruction::SIToFP:
2844 case Instruction::UIToFP:
2845 case Instruction::Trunc:
2846 case Instruction::FPTrunc:
2847 case Instruction::BitCast: {
2848 ValueList INVL;
2849 for (Value *V : E->Scalars)
2850 INVL.push_back(cast<Instruction>(V)->getOperand(0));
2851
2852 setInsertPointAfterBundle(E->Scalars, VL0);
2853
2854 Value *InVec = vectorizeTree(INVL);
2855
2856 if (Value *V = alreadyVectorized(E->Scalars, VL0))
2857 return V;
2858
2859 CastInst *CI = dyn_cast<CastInst>(VL0);
2860 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2861 E->VectorizedValue = V;
2862 ++NumVectorInstructions;
2863 return V;
2864 }
2865 case Instruction::FCmp:
2866 case Instruction::ICmp: {
2867 ValueList LHSV, RHSV;
2868 for (Value *V : E->Scalars) {
2869 LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2870 RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2871 }
2872
2873 setInsertPointAfterBundle(E->Scalars, VL0);
2874
2875 Value *L = vectorizeTree(LHSV);
2876 Value *R = vectorizeTree(RHSV);
2877
2878 if (Value *V = alreadyVectorized(E->Scalars, VL0))
2879 return V;
2880
2881 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2882 Value *V;
2883 if (S.Opcode == Instruction::FCmp)
2884 V = Builder.CreateFCmp(P0, L, R);
2885 else
2886 V = Builder.CreateICmp(P0, L, R);
2887
2888 E->VectorizedValue = V;
2889 propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
2890 ++NumVectorInstructions;
2891 return V;
2892 }
2893 case Instruction::Select: {
2894 ValueList TrueVec, FalseVec, CondVec;
2895 for (Value *V : E->Scalars) {
2896 CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2897 TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2898 FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2899 }
2900
2901 setInsertPointAfterBundle(E->Scalars, VL0);
2902
2903 Value *Cond = vectorizeTree(CondVec);
2904 Value *True = vectorizeTree(TrueVec);
2905 Value *False = vectorizeTree(FalseVec);
2906
2907 if (Value *V = alreadyVectorized(E->Scalars, VL0))
2908 return V;
2909
2910 Value *V = Builder.CreateSelect(Cond, True, False);
2911 E->VectorizedValue = V;
2912 ++NumVectorInstructions;
2913 return V;
2914 }
2915 case Instruction::Add:
2916 case Instruction::FAdd:
2917 case Instruction::Sub:
2918 case Instruction::FSub:
2919 case Instruction::Mul:
2920 case Instruction::FMul:
2921 case Instruction::UDiv:
2922 case Instruction::SDiv:
2923 case Instruction::FDiv:
2924 case Instruction::URem:
2925 case Instruction::SRem:
2926 case Instruction::FRem:
2927 case Instruction::Shl:
2928 case Instruction::LShr:
2929 case Instruction::AShr:
2930 case Instruction::And:
2931 case Instruction::Or:
2932 case Instruction::Xor: {
2933 ValueList LHSVL, RHSVL;
2934 if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2935 reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
2936 RHSVL);
2937 else
2938 for (Value *V : E->Scalars) {
2939 auto *I = cast<Instruction>(V);
2940 LHSVL.push_back(I->getOperand(0));
2941 RHSVL.push_back(I->getOperand(1));
2942 }
2943
2944 setInsertPointAfterBundle(E->Scalars, VL0);
2945
2946 Value *LHS = vectorizeTree(LHSVL);
2947 Value *RHS = vectorizeTree(RHSVL);
2948
2949 if (Value *V = alreadyVectorized(E->Scalars, VL0))
2950 return V;
2951
2952 Value *V = Builder.CreateBinOp(
2953 static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
2954 E->VectorizedValue = V;
2955 propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
2956 ++NumVectorInstructions;
2957
2958 if (Instruction *I = dyn_cast<Instruction>(V))
2959 return propagateMetadata(I, E->Scalars);
2960
2961 return V;
2962 }
2963 case Instruction::Load: {
2964 // Loads are inserted at the head of the tree because we don't want to
2965 // sink them all the way down past store instructions.
2966 setInsertPointAfterBundle(E->Scalars, VL0);
2967
2968 LoadInst *LI = cast<LoadInst>(VL0);
2969 Type *ScalarLoadTy = LI->getType();
2970 unsigned AS = LI->getPointerAddressSpace();
2971
2972 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2973 VecTy->getPointerTo(AS));
2974
2975 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2976 // ExternalUses list to make sure that an extract will be generated in the
2977 // future.
2978 Value *PO = LI->getPointerOperand();
2979 if (getTreeEntry(PO))
2980 ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
2981
2982 unsigned Alignment = LI->getAlignment();
2983 LI = Builder.CreateLoad(VecPtr);
2984 if (!Alignment) {
2985 Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2986 }
2987 LI->setAlignment(Alignment);
2988 E->VectorizedValue = LI;
2989 ++NumVectorInstructions;
2990 return propagateMetadata(LI, E->Scalars);
2991 }
2992 case Instruction::Store: {
2993 StoreInst *SI = cast<StoreInst>(VL0);
2994 unsigned Alignment = SI->getAlignment();
2995 unsigned AS = SI->getPointerAddressSpace();
2996
2997 ValueList ScalarStoreValues;
2998 for (Value *V : E->Scalars)
2999 ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
3000
3001 setInsertPointAfterBundle(E->Scalars, VL0);
3002
3003 Value *VecValue = vectorizeTree(ScalarStoreValues);
3004 Value *ScalarPtr = SI->getPointerOperand();
3005 Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
3006 StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
3007
3008 // The pointer operand uses an in-tree scalar, so add the new BitCast to
3009 // ExternalUses to make sure that an extract will be generated in the
3010 // future.
3011 if (getTreeEntry(ScalarPtr))
3012 ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
3013
3014 if (!Alignment)
3015 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
3016
3017 S->setAlignment(Alignment);
3018 E->VectorizedValue = S;
3019 ++NumVectorInstructions;
3020 return propagateMetadata(S, E->Scalars);
3021 }
3022 case Instruction::GetElementPtr: {
3023 setInsertPointAfterBundle(E->Scalars, VL0);
3024
3025 ValueList Op0VL;
3026 for (Value *V : E->Scalars)
3027 Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
3028
3029 Value *Op0 = vectorizeTree(Op0VL);
3030
3031 std::vector<Value *> OpVecs;
3032 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
3033 ++j) {
3034 ValueList OpVL;
3035 for (Value *V : E->Scalars)
3036 OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
3037
3038 Value *OpVec = vectorizeTree(OpVL);
3039 OpVecs.push_back(OpVec);
3040 }
3041
3042 Value *V = Builder.CreateGEP(
3043 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
3044 E->VectorizedValue = V;
3045 ++NumVectorInstructions;
3046
3047 if (Instruction *I = dyn_cast<Instruction>(V))
3048 return propagateMetadata(I, E->Scalars);
3049
3050 return V;
3051 }
3052 case Instruction::Call: {
3053 CallInst *CI = cast<CallInst>(VL0);
3054 setInsertPointAfterBundle(E->Scalars, VL0);
3055 Function *FI;
3056 Intrinsic::ID IID = Intrinsic::not_intrinsic;
3057 Value *ScalarArg = nullptr;
3058 if (CI && (FI = CI->getCalledFunction())) {
3059 IID = FI->getIntrinsicID();
3060 }
3061 std::vector<Value *> OpVecs;
3062 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
3063 ValueList OpVL;
3064 // ctlz, cttz, and powi are special intrinsics whose second argument is
3065 // a scalar. This argument should not be vectorized.
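// For example, a call such as @llvm.powi.f32(float %x, i32 %n) keeps the
// i32 exponent %n as a scalar operand of the widened call below, while %x is
// gathered into a vector operand (the operand names here are illustrative).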
3066 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
3067 CallInst *CEI = cast<CallInst>(VL0);
3068 ScalarArg = CEI->getArgOperand(j);
3069 OpVecs.push_back(CEI->getArgOperand(j));
3070 continue;
3071 }
3072 for (Value *V : E->Scalars) {
3073 CallInst *CEI = cast<CallInst>(V);
3074 OpVL.push_back(CEI->getArgOperand(j));
3075 }
3076
3077 Value *OpVec = vectorizeTree(OpVL);
3078 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
3079 OpVecs.push_back(OpVec);
3080 }
3081
3082 Module *M = F->getParent();
3083 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3084 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
3085 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
3086 SmallVector<OperandBundleDef, 1> OpBundles;
3087 CI->getOperandBundlesAsDefs(OpBundles);
3088 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
3089
3090 // The scalar argument uses an in-tree scalar so we add the new vectorized
3091 // call to ExternalUses list to make sure that an extract will be
3092 // generated in the future.
3093 if (ScalarArg && getTreeEntry(ScalarArg))
3094 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
3095
3096 E->VectorizedValue = V;
3097 propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
3098 ++NumVectorInstructions;
3099 return V;
3100 }
3101 case Instruction::ShuffleVector: {
3102 ValueList LHSVL, RHSVL;
3103 assert(Instruction::isBinaryOp(S.Opcode) &&
3104 "Invalid Shuffle Vector Operand");
3105 reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
3106 setInsertPointAfterBundle(E->Scalars, VL0);
3107
3108 Value *LHS = vectorizeTree(LHSVL);
3109 Value *RHS = vectorizeTree(RHSVL);
3110
3111 if (Value *V = alreadyVectorized(E->Scalars, VL0))
3112 return V;
3113
3114 // Create a vector of LHS op1 RHS
3115 Value *V0 = Builder.CreateBinOp(
3116 static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
3117
3118 unsigned AltOpcode = getAltOpcode(S.Opcode);
3119 // Create a vector of LHS op2 RHS
3120 Value *V1 = Builder.CreateBinOp(
3121 static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
3122
3123 // Create shuffle to take alternate operations from the vector.
3124 // Also, gather up odd and even scalar ops to propagate IR flags to
3125 // each vector operation.
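// For example, with four scalars (e == 4) the mask built below is
// <0, 5, 2, 7>: even lanes are taken from V0 (the main opcode) and odd lanes
// from V1 (the alternate opcode).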
3126 ValueList OddScalars, EvenScalars;
3127 unsigned e = E->Scalars.size();
3128 SmallVector<Constant *, 8> Mask(e);
3129 for (unsigned i = 0; i < e; ++i) {
3130 if (isOdd(i)) {
3131 Mask[i] = Builder.getInt32(e + i);
3132 OddScalars.push_back(E->Scalars[i]);
3133 } else {
3134 Mask[i] = Builder.getInt32(i);
3135 EvenScalars.push_back(E->Scalars[i]);
3136 }
3137 }
3138
3139 Value *ShuffleMask = ConstantVector::get(Mask);
3140 propagateIRFlags(V0, EvenScalars);
3141 propagateIRFlags(V1, OddScalars);
3142
3143 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
3144 E->VectorizedValue = V;
3145 ++NumVectorInstructions;
3146 if (Instruction *I = dyn_cast<Instruction>(V))
3147 return propagateMetadata(I, E->Scalars);
3148
3149 return V;
3150 }
3151 default:
3152 llvm_unreachable("unknown inst");
3153 }
3154 return nullptr;
3155}
3156
3157Value *BoUpSLP::vectorizeTree() {
3158 ExtraValueToDebugLocsMap ExternallyUsedValues;
3159 return vectorizeTree(ExternallyUsedValues);
3160}
3161
3162Value *
3163BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
3164 // All blocks must be scheduled before any instructions are inserted.
3165 for (auto &BSIter : BlocksSchedules) {
3166 scheduleBlock(BSIter.second.get());
3167 }
3168
3169 Builder.SetInsertPoint(&F->getEntryBlock().front());
3170 auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
3171
3172 // If the vectorized tree can be rewritten in a smaller type, we truncate the
3173 // vectorized root. InstCombine will then rewrite the entire expression. We
3174 // sign extend the extracted values below.
3175 auto *ScalarRoot = VectorizableTree[0].Scalars[0];
3176 if (MinBWs.count(ScalarRoot)) {
3177 if (auto *I = dyn_cast<Instruction>(VectorRoot))
3178 Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
3179 auto BundleWidth = VectorizableTree[0].Scalars.size();
3180 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
3181 auto *VecTy = VectorType::get(MinTy, BundleWidth);
3182 auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
3183 VectorizableTree[0].VectorizedValue = Trunc;
3184 }
3185
3186 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
3187
3188 // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
3189 // specified by ScalarType.
3190 auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
3191 if (!MinBWs.count(ScalarRoot))
3192 return Ex;
3193 if (MinBWs[ScalarRoot].second)
3194 return Builder.CreateSExt(Ex, ScalarType);
3195 return Builder.CreateZExt(Ex, ScalarType);
3196 };
3197
3198 // Extract all of the elements with the external uses.
3199 for (const auto &ExternalUse : ExternalUses) {
3200 Value *Scalar = ExternalUse.Scalar;
3201 llvm::User *User = ExternalUse.User;
3202
3203 // Skip users that we already RAUW. This happens when one instruction
3204 // has multiple uses of the same value.
3205 if (User && !is_contained(Scalar->users(), User))
3206 continue;
3207 TreeEntry *E = getTreeEntry(Scalar);
3208 assert(E && "Invalid scalar");
3209 assert(!E->NeedToGather && "Extracting from a gather list");
3210
3211 Value *Vec = E->VectorizedValue;
3212 assert(Vec && "Can't find vectorizable value");
3213
3214 Value *Lane = Builder.getInt32(ExternalUse.Lane);
3215 // If User == nullptr, the Scalar is used as extra arg. Generate
3216 // ExtractElement instruction and update the record for this scalar in
3217 // ExternallyUsedValues.
3218 if (!User) {
3219 assert(ExternallyUsedValues.count(Scalar) &&
3220 "Scalar with nullptr as an external user must be registered in "
3221 "ExternallyUsedValues map");
3222 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
3223 Builder.SetInsertPoint(VecI->getParent(),
3224 std::next(VecI->getIterator()));
3225 } else {
3226 Builder.SetInsertPoint(&F->getEntryBlock().front());
3227 }
3228 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3229 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3230 CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
3231 auto &Locs = ExternallyUsedValues[Scalar];
3232 ExternallyUsedValues.insert({Ex, Locs});
3233 ExternallyUsedValues.erase(Scalar);
3234 continue;
3235 }
3236
3237 // Generate extracts for out-of-tree users.
3238 // Find the insertion point for the extractelement lane.
3239 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
3240 if (PHINode *PH = dyn_cast<PHINode>(User)) {
3241 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
3242 if (PH->getIncomingValue(i) == Scalar) {
3243 TerminatorInst *IncomingTerminator =
3244 PH->getIncomingBlock(i)->getTerminator();
3245 if (isa<CatchSwitchInst>(IncomingTerminator)) {
3246 Builder.SetInsertPoint(VecI->getParent(),
3247 std::next(VecI->getIterator()));
3248 } else {
3249 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
3250 }
3251 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3252 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3253 CSEBlocks.insert(PH->getIncomingBlock(i));
3254 PH->setOperand(i, Ex);
3255 }
3256 }
3257 } else {
3258 Builder.SetInsertPoint(cast<Instruction>(User));
3259 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3260 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3261 CSEBlocks.insert(cast<Instruction>(User)->getParent());
3262 User->replaceUsesOfWith(Scalar, Ex);
3263 }
3264 } else {
3265 Builder.SetInsertPoint(&F->getEntryBlock().front());
3266 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3267 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3268 CSEBlocks.insert(&F->getEntryBlock());
3269 User->replaceUsesOfWith(Scalar, Ex);
3270 }
3271
3272 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
3273 }
3274
3275 // For each vectorized value:
3276 for (TreeEntry &EIdx : VectorizableTree) {
3277 TreeEntry *Entry = &EIdx;
3278
3279 // No need to handle users of gathered values.
3280 if (Entry->NeedToGather)
3281 continue;
3282
3283 assert(Entry->VectorizedValue && "Can't find vectorizable value");
3284
3285 // For each lane:
3286 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
3287 Value *Scalar = Entry->Scalars[Lane];
3288
3289 Type *Ty = Scalar->getType();
3290 if (!Ty->isVoidTy()) {
3291#ifndef NDEBUG
3292 for (User *U : Scalar->users()) {
3293 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
3294
3295 // It is legal to replace users in the ignorelist by undef.
3296 assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
3297 "Replacing out-of-tree value with undef");
3298 }
3299#endif
3300 Value *Undef = UndefValue::get(Ty);
3301 Scalar->replaceAllUsesWith(Undef);
3302 }
3303 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
3304 eraseInstruction(cast<Instruction>(Scalar));
3305 }
3306 }
3307
3308 Builder.ClearInsertionPoint();
3309
3310 return VectorizableTree[0].VectorizedValue;
3311}
3312
3313void BoUpSLP::optimizeGatherSequence() {
3314 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
3315 << " gather sequences instructions.\n");
3316 // LICM InsertElementInst sequences.
3317 for (Instruction *it : GatherSeq) {
3318 InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
3319
3320 if (!Insert)
3321 continue;
3322
3323 // Check if this block is inside a loop.
3324 Loop *L = LI->getLoopFor(Insert->getParent());
3325 if (!L)
3326 continue;
3327
3328 // Check if it has a preheader.
3329 BasicBlock *PreHeader = L->getLoopPreheader();
3330 if (!PreHeader)
3331 continue;
3332
3333 // If the vector or the element that we insert into it are
3334 // instructions that are defined in this basic block then we can't
3335 // hoist this instruction.
3336 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
3337 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
3338 if (CurrVec && L->contains(CurrVec))
3339 continue;
3340 if (NewElem && L->contains(NewElem))
3341 continue;
3342
3343 // We can hoist this instruction. Move it to the pre-header.
3344 Insert->moveBefore(PreHeader->getTerminator());
3345 }
3346
3347 // Make a list of all reachable blocks in our CSE queue.
3348 SmallVector<const DomTreeNode *, 8> CSEWorkList;
3349 CSEWorkList.reserve(CSEBlocks.size());
3350 for (BasicBlock *BB : CSEBlocks)
3351 if (DomTreeNode *N = DT->getNode(BB)) {
3352 assert(DT->isReachableFromEntry(N));
3353 CSEWorkList.push_back(N);
3354 }
3355
3356 // Sort blocks by domination. This ensures we visit a block after all blocks
3357 // dominating it are visited.
3358 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
3359 [this](const DomTreeNode *A, const DomTreeNode *B) {
3360 return DT->properlyDominates(A, B);
3361 });
3362
3363 // Perform O(N^2) search over the gather sequences and merge identical
3364 // instructions. TODO: We can further optimize this scan if we split the
3365 // instructions into different buckets based on the insert lane.
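// In other words, every InsertElement/ExtractElement instruction found in a
// CSE block is compared against the instructions visited so far; when an
// identical instruction exists in a dominating block, the later copy is
// RAUW'd and erased instead of being kept.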
3366 SmallVector<Instruction *, 16> Visited;
3367 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21. Assuming 'I' is not equal to 'E'
22. Loop condition is true. Entering loop body
24. Loop condition is true. Entering loop body
26. Loop condition is true. Entering loop body
28. Loop condition is true. Entering loop body
3368 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
3369 "Worklist not sorted properly!");
3370 BasicBlock *BB = (*I)->getBlock();
29. Called C++ object pointer is null
3371 // For all instructions in blocks containing gather sequences:
3372 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
23. Loop condition is false. Execution continues on line 3367
25. Loop condition is false. Execution continues on line 3367
27. Loop condition is false. Execution continues on line 3367
3373 Instruction *In = &*it++;
3374 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
3375 continue;
3376
3377 // Check if we can replace this instruction with any of the
3378 // visited instructions.
3379 for (Instruction *v : Visited) {
3380 if (In->isIdenticalTo(v) &&
3381 DT->dominates(v->getParent(), In->getParent())) {
3382 In->replaceAllUsesWith(v);
3383 eraseInstruction(In);
3384 In = nullptr;
3385 break;
3386 }
3387 }
3388 if (In) {
3389 assert(!is_contained(Visited, In));
3390 Visited.push_back(In);
3391 }
3392 }
3393 }
3394 CSEBlocks.clear();
3395 GatherSeq.clear();
3396}
3397
3398// Groups the instructions into a bundle (which is then a single scheduling
3399// entity) and schedules instructions until the bundle becomes ready.
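// Roughly speaking, the bundle becomes "ready" once all of the dependencies
// accumulated in its UnscheduledDepsInBundle counter have been scheduled;
// the loop near the end of tryScheduleBundle keeps scheduling ready
// instructions until that happens or the ready list runs dry.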
3400bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
3401 BoUpSLP *SLP, Value *OpValue) {
3402 if (isa<PHINode>(OpValue))
3403 return true;
3404
3405 // Initialize the instruction bundle.
3406 Instruction *OldScheduleEnd = ScheduleEnd;
3407 ScheduleData *PrevInBundle = nullptr;
3408 ScheduleData *Bundle = nullptr;
3409 bool ReSchedule = false;
3410 DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
3411
3412 // Make sure that the scheduling region contains all
3413 // instructions of the bundle.
3414 for (Value *V : VL) {
3415 if (!extendSchedulingRegion(V, OpValue))
3416 return false;
3417 }
3418
3419 for (Value *V : VL) {
3420 ScheduleData *BundleMember = getScheduleData(V);
3421 assert(BundleMember &&
3422 "no ScheduleData for bundle member (maybe not in same basic block)");
3423 if (BundleMember->IsScheduled) {
3424 // A bundle member was scheduled as single instruction before and now
3425 // needs to be scheduled as part of the bundle. We just get rid of the
3426 // existing schedule.
3427 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
3428 << " was already scheduled\n");
3429 ReSchedule = true;
3430 }
3431 assert(BundleMember->isSchedulingEntity() &&
3432 "bundle member already part of other bundle");
3433 if (PrevInBundle) {
3434 PrevInBundle->NextInBundle = BundleMember;
3435 } else {
3436 Bundle = BundleMember;
3437 }
3438 BundleMember->UnscheduledDepsInBundle = 0;
3439 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
3440
3441 // Group the instructions to a bundle.
3442 BundleMember->FirstInBundle = Bundle;
3443 PrevInBundle = BundleMember;
3444 }
3445 if (ScheduleEnd != OldScheduleEnd) {
3446 // The scheduling region got new instructions at the lower end (or it is a
3447 // new region for the first bundle). This makes it necessary to
3448 // recalculate all dependencies.
3449 // It is seldom that this needs to be done a second time after adding the
3450 // initial bundle to the region.
3451 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3452 doForAllOpcodes(I, [](ScheduleData *SD) {
3453 SD->clearDependencies();
3454 });
3455 }
3456 ReSchedule = true;
3457 }
3458 if (ReSchedule) {
3459 resetSchedule();
3460 initialFillReadyList(ReadyInsts);
3461 }
3462
3463 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
3464 << BB->getName() << "\n");
3465
3466 calculateDependencies(Bundle, true, SLP);
3467
3468 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
3469 // means that there are no cyclic dependencies and we can schedule it.
3470 // Note that it's important that we don't "schedule" the bundle yet (see
3471 // cancelScheduling).
3472 while (!Bundle->isReady() && !ReadyInsts.empty()) {
3473
3474 ScheduleData *pickedSD = ReadyInsts.back();
3475 ReadyInsts.pop_back();
3476
3477 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
3478 schedule(pickedSD, ReadyInsts);
3479 }
3480 }
3481 if (!Bundle->isReady()) {
3482 cancelScheduling(VL, OpValue);
3483 return false;
3484 }
3485 return true;
3486}
3487
3488void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
3489 Value *OpValue) {
3490 if (isa<PHINode>(OpValue))
3491 return;
3492
3493 ScheduleData *Bundle = getScheduleData(OpValue);
3494 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
3495 assert(!Bundle->IsScheduled &&
3496 "Can't cancel bundle which is already scheduled");
3497 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
3498 "tried to unbundle something which is not a bundle");
3499
3500 // Un-bundle: make single instructions out of the bundle.
3501 ScheduleData *BundleMember = Bundle;
3502 while (BundleMember) {
3503 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
3504 BundleMember->FirstInBundle = BundleMember;
3505 ScheduleData *Next = BundleMember->NextInBundle;
3506 BundleMember->NextInBundle = nullptr;
3507 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
3508 if (BundleMember->UnscheduledDepsInBundle == 0) {
3509 ReadyInsts.insert(BundleMember);
3510 }
3511 BundleMember = Next;
3512 }
3513}
3514
3515BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
3516 // Allocate a new ScheduleData for the instruction.
3517 if (ChunkPos >= ChunkSize) {
3518 ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
3519 ChunkPos = 0;
3520 }
3521 return &(ScheduleDataChunks.back()[ChunkPos++]);
3522}
3523
3524bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
3525 Value *OpValue) {
3526 if (getScheduleData(V, isOneOf(OpValue, V)))
3527 return true;
3528 Instruction *I = dyn_cast<Instruction>(V);
3529 assert(I && "bundle member must be an instruction");
3530 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
3531 auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
3532 ScheduleData *ISD = getScheduleData(I);
3533 if (!ISD)
3534 return false;
3535 assert(isInSchedulingRegion(ISD) &&
3536 "ScheduleData not in scheduling region");
3537 ScheduleData *SD = allocateScheduleDataChunks();
3538 SD->Inst = I;
3539 SD->init(SchedulingRegionID, OpValue);
3540 ExtraScheduleDataMap[I][OpValue] = SD;
3541 return true;
3542 };
3543 if (CheckSheduleForI(I))
3544 return true;
3545 if (!ScheduleStart) {
3546 // It's the first instruction in the new region.
3547 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
3548 ScheduleStart = I;
3549 ScheduleEnd = I->getNextNode();
3550 if (isOneOf(OpValue, I) != I)
3551 CheckSheduleForI(I);
3552 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3553 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
3554 return true;
3555 }
3556 // Search up and down at the same time, because we don't know if the new
3557 // instruction is above or below the existing scheduling region.
3558 BasicBlock::reverse_iterator UpIter =
3559 ++ScheduleStart->getIterator().getReverse();
3560 BasicBlock::reverse_iterator UpperEnd = BB->rend();
3561 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
3562 BasicBlock::iterator LowerEnd = BB->end();
3563 while (true) {
3564 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
3565 DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
3566 return false;
3567 }
3568
3569 if (UpIter != UpperEnd) {
3570 if (&*UpIter == I) {
3571 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
3572 ScheduleStart = I;
3573 if (isOneOf(OpValue, I) != I)
3574 CheckSheduleForI(I);
3575 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
3576 return true;
3577 }
3578 UpIter++;
3579 }
3580 if (DownIter != LowerEnd) {
3581 if (&*DownIter == I) {
3582 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
3583 nullptr);
3584 ScheduleEnd = I->getNextNode();
3585 if (isOneOf(OpValue, I) != I)
3586 CheckSheduleForI(I);
3587 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3588 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
3589 return true;
3590 }
3591 DownIter++;
3592 }
3593 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
3594        "instruction not found in block");
3595 }
3596 return true;
3597}
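The loop above grows the scheduling region upward and downward at the same time, because the new instruction may lie on either side of the existing region, and it gives up once the region exceeds ScheduleRegionSizeLimit. The standalone sketch below is not part of SLPVectorizer.cpp; growWindow, SizeLimit and the index-based window are invented stand-ins that mirror only the search pattern.

#include <optional>
#include <utility>

// Sketch: grow a half-open window [Start, End) over indices 0..Size-1 until it
// covers Target, scanning upward and downward simultaneously; give up once the
// window gets larger than SizeLimit (the analogue of ScheduleRegionSizeLimit).
std::optional<std::pair<unsigned, unsigned>>
growWindow(unsigned Start, unsigned End, unsigned Target, unsigned Size,
           unsigned SizeLimit) {
  unsigned Up = Start, Down = End, WindowSize = End - Start;
  while (Up > 0 || Down < Size) {
    if (++WindowSize > SizeLimit)
      return std::nullopt;                       // region would become too large
    if (Up > 0 && --Up == Target)
      return std::make_pair(Target, End);        // extend the region start
    if (Down < Size) {
      if (Down == Target)
        return std::make_pair(Start, Target + 1); // extend the region end
      ++Down;
    }
  }
  return std::nullopt;                           // Target was never found
}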
3598
3599void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
3600 Instruction *ToI,
3601 ScheduleData *PrevLoadStore,
3602 ScheduleData *NextLoadStore) {
3603 ScheduleData *CurrentLoadStore = PrevLoadStore;
3604 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
3605 ScheduleData *SD = ScheduleDataMap[I];
3606 if (!SD) {
3607 SD = allocateScheduleDataChunks();
3608 ScheduleDataMap[I] = SD;
3609 SD->Inst = I;
3610 }
3611 assert(!isInSchedulingRegion(SD) &&
3612        "new ScheduleData already in scheduling region");
3613 SD->init(SchedulingRegionID, I);
3614
3615 if (I->mayReadOrWriteMemory() &&
3616 (!isa<IntrinsicInst>(I) ||
3617 cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
3618 // Update the linked list of memory accessing instructions.
3619 if (CurrentLoadStore) {
3620 CurrentLoadStore->NextLoadStore = SD;
3621 } else {
3622 FirstLoadStoreInRegion = SD;
3623 }
3624 CurrentLoadStore = SD;
3625 }
3626 }
3627 if (NextLoadStore) {
3628 if (CurrentLoadStore)
3629 CurrentLoadStore->NextLoadStore = NextLoadStore;
3630 } else {
3631 LastLoadStoreInRegion = CurrentLoadStore;
3632 }
3633}
3634
3635void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
3636 bool InsertInReadyList,
3637 BoUpSLP *SLP) {
3638 assert(SD->isSchedulingEntity());
3639
3640 SmallVector<ScheduleData *, 10> WorkList;
3641 WorkList.push_back(SD);
3642
3643 while (!WorkList.empty()) {
3644 ScheduleData *SD = WorkList.back();
3645 WorkList.pop_back();
3646
3647 ScheduleData *BundleMember = SD;
3648 while (BundleMember) {
3649 assert(isInSchedulingRegion(BundleMember));
3650 if (!BundleMember->hasValidDependencies()) {
3651
3652 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
3653 BundleMember->Dependencies = 0;
3654 BundleMember->resetUnscheduledDeps();
3655
3656 // Handle def-use chain dependencies.
3657 if (BundleMember->OpValue != BundleMember->Inst) {
3658 ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
3659 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
3660 BundleMember->Dependencies++;
3661 ScheduleData *DestBundle = UseSD->FirstInBundle;
3662 if (!DestBundle->IsScheduled)
3663 BundleMember->incrementUnscheduledDeps(1);
3664 if (!DestBundle->hasValidDependencies())
3665 WorkList.push_back(DestBundle);
3666 }
3667 } else {
3668 for (User *U : BundleMember->Inst->users()) {
3669 if (isa<Instruction>(U)) {
3670 ScheduleData *UseSD = getScheduleData(U);
3671 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
3672 BundleMember->Dependencies++;
3673 ScheduleData *DestBundle = UseSD->FirstInBundle;
3674 if (!DestBundle->IsScheduled)
3675 BundleMember->incrementUnscheduledDeps(1);
3676 if (!DestBundle->hasValidDependencies())
3677 WorkList.push_back(DestBundle);
3678 }
3679 } else {
3680 // It is unclear whether this can ever happen, but we stay safe:
3681 // counting a dependency that can never be resolved means the
3682 // instruction/bundle never becomes ready, which eventually disables
3682 // vectorization of the bundle.
3683 BundleMember->Dependencies++;
3684 BundleMember->incrementUnscheduledDeps(1);
3685 }
3686 }
3687 }
3688
3689 // Handle the memory dependencies.
3690 ScheduleData *DepDest = BundleMember->NextLoadStore;
3691 if (DepDest) {
3692 Instruction *SrcInst = BundleMember->Inst;
3693 MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
3694 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
3695 unsigned numAliased = 0;
3696 unsigned DistToSrc = 1;
3697
3698 while (DepDest) {
3699 assert(isInSchedulingRegion(DepDest));
3700
3701 // We have two limits to reduce the complexity:
3702 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
3703 // SLP->isAliased (which is the expensive part in this loop).
3704 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
3705 // the whole loop (even if the loop is fast, it's quadratic).
3706 // It's important for the loop break condition (see below) to
3707 // check this limit even between two read-only instructions.
3708 if (DistToSrc >= MaxMemDepDistance ||
3709 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
3710 (numAliased >= AliasedCheckLimit ||
3711 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
3712
3713 // We increment the counter only if the locations are aliased
3714 // (instead of counting all alias checks). This gives a better
3715 // balance between reduced runtime and accurate dependencies.
3716 numAliased++;
3717
3718 DepDest->MemoryDependencies.push_back(BundleMember);
3719 BundleMember->Dependencies++;
3720 ScheduleData *DestBundle = DepDest->FirstInBundle;
3721 if (!DestBundle->IsScheduled) {
3722 BundleMember->incrementUnscheduledDeps(1);
3723 }
3724 if (!DestBundle->hasValidDependencies()) {
3725 WorkList.push_back(DestBundle);
3726 }
3727 }
3728 DepDest = DepDest->NextLoadStore;
3729
3730 // Example, explaining the loop break condition: Let's assume our
3731 // starting instruction is i0 and MaxMemDepDistance = 3.
3732 //
3733 // +--------v--v--v
3734 // i0,i1,i2,i3,i4,i5,i6,i7,i8
3735 // +--------^--^--^
3736 //
3737 // MaxMemDepDistance lets us stop alias-checking at i3, and we add
3738 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
3739 // Previously we already added dependencies from i3 to i6,i7,i8
3740 // (because of MaxMemDepDistance). As we added a dependency from
3741 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
3742 // and we can abort this loop at i6.
3743 if (DistToSrc >= 2 * MaxMemDepDistance)
3744 break;
3745 DistToSrc++;
3746 }
3747 }
3748 }
3749 BundleMember = BundleMember->NextInBundle;
3750 }
3751 if (InsertInReadyList && SD->isReady()) {
3752 ReadyInsts.push_back(SD);
3753 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
3754 }
3755 }
3756}
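The memory-dependence part of the walk above is bounded by two limits, AliasedCheckLimit and MaxMemDepDistance, plus the 2 * MaxMemDepDistance cut-off explained in the comment. The following standalone sketch is not the LLVM code: it drops the read/write filtering and bundle bookkeeping, MayAlias is an assumed stand-in for SLP->isAliased, and the default limits are only illustrative. It shows just how the limits shape which dependency edges get added.

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Sketch: add a dependency edge (Src, Dst) either because the two accesses may
// alias, or conservatively once we stop paying for alias queries or walk past
// the distance limit. Edges beyond 2 * MaxMemDepDistance are implied
// transitively, so the inner scan can stop there.
std::vector<std::pair<std::size_t, std::size_t>>
memoryDeps(std::size_t NumOps,
           const std::function<bool(std::size_t, std::size_t)> &MayAlias,
           unsigned MaxMemDepDistance = 160, unsigned AliasedCheckLimit = 10) {
  std::vector<std::pair<std::size_t, std::size_t>> Deps;
  for (std::size_t Src = 0; Src < NumOps; ++Src) {
    unsigned NumAliased = 0, DistToSrc = 1;
    for (std::size_t Dst = Src + 1; Dst < NumOps; ++Dst) {
      if (DistToSrc >= MaxMemDepDistance || NumAliased >= AliasedCheckLimit ||
          MayAlias(Src, Dst)) {
        ++NumAliased;             // count pairs treated as aliased, not queries
        Deps.emplace_back(Src, Dst);
      }
      if (DistToSrc >= 2 * MaxMemDepDistance)
        break;                    // the rest is covered transitively
      ++DistToSrc;
    }
  }
  return Deps;
}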
3757
3758void BoUpSLP::BlockScheduling::resetSchedule() {
3759 assert(ScheduleStart &&
3760        "tried to reset schedule on block which has not been scheduled");
3761 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3762 doForAllOpcodes(I, [&](ScheduleData *SD) {
3763 assert(isInSchedulingRegion(SD) &&
3764        "ScheduleData not in scheduling region");
3765 SD->IsScheduled = false;
3766 SD->resetUnscheduledDeps();
3767 });
3768 }
3769 ReadyInsts.clear();
3770}
3771
3772void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
3773 if (!BS->ScheduleStart)
3774 return;
3775
3776 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
3777
3778 BS->resetSchedule();
3779
3780 // For the real scheduling we use a more sophisticated ready-list: it is
3781 // sorted by the original instruction location. This lets the final schedule
3782 // be as close as possible to the original instruction order.
3783 struct ScheduleDataCompare {
3784 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
3785 return SD2->SchedulingPriority < SD1->SchedulingPriority;
3786 }
3787 };
3788 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3789
3790 // Ensure that all dependency data is updated and fill the ready-list with
3791 // initial instructions.
3792 int Idx = 0;
3793 int NumToSchedule = 0;
3794 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3795 I = I->getNextNode()) {
3796 BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
3797 assert(SD->isPartOfBundle() ==
3798            (getTreeEntry(SD->Inst) != nullptr) &&
3799        "scheduler and vectorizer bundle mismatch");
3800 SD->FirstInBundle->SchedulingPriority = Idx++;
3801 if (SD->isSchedulingEntity()) {
3802 BS->calculateDependencies(SD, false, this);
3803 NumToSchedule++;
3804 }
3805 });
3806 }
3807 BS->initialFillReadyList(ReadyInsts);
3808
3809 Instruction *LastScheduledInst = BS->ScheduleEnd;
3810
3811 // Do the "real" scheduling.
3812 while (!ReadyInsts.empty()) {
3813 ScheduleData *picked = *ReadyInsts.begin();
3814 ReadyInsts.erase(ReadyInsts.begin());
3815
3816 // Move the scheduled instruction(s) to their dedicated places, if not
3817 // there yet.
3818 ScheduleData *BundleMember = picked;
3819 while (BundleMember) {
3820 Instruction *pickedInst = BundleMember->Inst;
3821 if (LastScheduledInst->getNextNode() != pickedInst) {
3822 BS->BB->getInstList().remove(pickedInst);
3823 BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
3824 pickedInst);
3825 }
3826 LastScheduledInst = pickedInst;
3827 BundleMember = BundleMember->NextInBundle;
3828 }
3829
3830 BS->schedule(picked, ReadyInsts);
3831 NumToSchedule--;
3832 }
3833 assert(NumToSchedule == 0 && "could not schedule all instructions");
3834
3835 // Avoid duplicate scheduling of the block.
3836 BS->ScheduleStart = nullptr;
3837}
3838
3839unsigned BoUpSLP::getVectorElementSize(Value *V) {
3840 // If V is a store, just return the width of the stored value without
3841 // traversing the expression tree. This is the common case.
3842 if (auto *Store = dyn_cast<StoreInst>(V))
3843 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
3844
3845 // If V is not a store, we can traverse the expression tree to find loads
3846 // that feed it. The type of the loaded value may indicate a more suitable
3847 // width than V's type. We want to base the vector element size on the width
3848 // of memory operations where possible.
3849 SmallVector<Instruction *, 16> Worklist;
3850 SmallPtrSet<Instruction *, 16> Visited;
3851 if (auto *I = dyn_cast<Instruction>(V))
3852 Worklist.push_back(I);
3853
3854 // Traverse the expression tree in bottom-up order looking for loads. If we
3855 // encounter an instruction we don't yet handle, we give up.
3856 auto MaxWidth = 0u;
3857 auto FoundUnknownInst = false;
3858 while (!Worklist.empty() && !FoundUnknownInst) {
3859 auto *I = Worklist.pop_back_val();
3860 Visited.insert(I);
3861
3862 // We should only be looking at scalar instructions here. If the current
3863 // instruction has a vector type, give up.
3864 auto *Ty = I->getType();
3865 if (isa<VectorType>(Ty))
3866 FoundUnknownInst = true;
3867
3868 // If the current instruction is a load, update MaxWidth to reflect the
3869 // width of the loaded value.
3870 else if (isa<LoadInst>(I))
3871 MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
3872
3873 // Otherwise, we need to visit the operands of the instruction. We only
3874 // handle the interesting cases from buildTree here. If an operand is an
3875 // instruction we haven't yet visited, we add it to the worklist.
3876 else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
3877 isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
3878 for (Use &U : I->operands())
3879 if (auto *J = dyn_cast<Instruction>(U.get()))
3880 if (!Visited.count(J))
3881 Worklist.push_back(J);
3882 }
3883
3884 // If we don't yet handle the instruction, give up.
3885 else
3886 FoundUnknownInst = true;
3887 }
3888
3889 // If we didn't encounter a memory access in the expression tree, or if we
3890 // gave up for some reason, just return the width of V.
3891 if (!MaxWidth || FoundUnknownInst)
3892 return DL->getTypeSizeInBits(V->getType());
3893
3894 // Otherwise, return the maximum width we found.
3895 return MaxWidth;
3896}
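The same bottom-up walk can be sketched without any LLVM types. The snippet below is not part of the pass; Node, vectorElementSizeBits and the Kind values are invented stand-ins for instructions and their opcodes, used only to show how the widest feeding load, if any, wins over the value's own type width.

#include <algorithm>
#include <unordered_set>
#include <vector>

// Sketch: a stand-in for an instruction in the expression tree.
struct Node {
  enum Kind { Load, Cast, BinOp, Other } K;
  unsigned TypeBits;            // scalar bit width of this node's type
  std::vector<Node *> Operands; // operands we may follow bottom-up
};

unsigned vectorElementSizeBits(Node *Root) {
  unsigned MaxWidth = 0;
  bool FoundUnknown = false;
  std::vector<Node *> Worklist{Root};
  std::unordered_set<Node *> Visited;
  while (!Worklist.empty() && !FoundUnknown) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    Visited.insert(N);
    if (N->K == Node::Load)
      MaxWidth = std::max(MaxWidth, N->TypeBits);   // width of a feeding load
    else if (N->K == Node::Cast || N->K == Node::BinOp)
      for (Node *Op : N->Operands)
        if (!Visited.count(Op))
          Worklist.push_back(Op);
    else
      FoundUnknown = true;                          // give up on anything else
  }
  // No load found (or we gave up): fall back to the root's own type width.
  return (!MaxWidth || FoundUnknown) ? Root->TypeBits : MaxWidth;
}

For example, an i32 value computed as the sign-extension of an i8 load would come out as 8 bits, allowing four times as many lanes per vector register.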
3897
3898// Determine if a value V in a vectorizable expression Expr can be demoted to a
3899// smaller type with a truncation. We collect the values that will be demoted
3900// in ToDemote and additional roots that require investigating in Roots.
3901static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
3902 SmallVectorImpl<Value *> &ToDemote,
3903 SmallVectorImpl<Value *> &Roots) {
3904 // We can always demote constants.
3905 if (isa<Constant>(V)) {
3906 ToDemote.push_back(V);
3907 return true;
3908 }
3909
3910 // If the value is not an instruction in the expression with only one use, it
3911 // cannot be demoted.
3912 auto *I = dyn_cast<Instruction>(V);
3913 if (!I || !I->hasOneUse() || !Expr.count(I))
3914 return false;
3915
3916 switch (I->getOpcode()) {
3917
3918 // We can always demote truncations and extensions. Since truncations can
3919 // seed additional demotion, we save the truncated value.
3920 case Instruction::Trunc:
3921 Roots.push_back(I->getOperand(0));
3922 case Instruction::ZExt:
3923 case Instruction::SExt:
3924 break;
3925
3926 // We can demote certain binary operations if we can demote both of their
3927 // operands.
3928 case Instruction::Add:
3929 case Instruction::Sub:
3930 case Instruction::Mul:
3931 case Instruction::And:
3932 case Instruction::Or:
3933 case Instruction::Xor:
3934 if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
3935 !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
3936 return false;
3937 break;
3938
3939 // We can demote selects if we can demote their true and false values.
3940 case Instruction::Select: {
3941 SelectInst *SI = cast<SelectInst>(I);
3942 if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
3943 !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
3944 return false;
3945 break;
3946 }
3947
3948 // We can demote phis if we can demote all their incoming operands. Note that
3949 // we don't need to worry about cycles since we ensure single use above.
3950 case Instruction::PHI: {
3951 PHINode *PN = cast<PHINode>(I);
3952 for (Value *IncValue : PN->incoming_values())
3953 if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
3954 return false;
3955 break;
3956 }
3957
3958 // Otherwise, conservatively give up.
3959 default:
3960 return false;
3961 }
3962
3963 // Record the value that we can demote.
3964 ToDemote.push_back(V);
3965 return true;
3966}
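Stripped of the single-use and expression-membership checks, the recursion above reduces to "the whole sub-expression is built from constants, truncations/extensions, and operands that are themselves demotable". The toy version below is only an illustration: Expr and canDemote are invented names, and it ignores the Roots bookkeeping that truncations seed.

#include <vector>

// Sketch: a toy expression node. Only the shapes handled above are modeled.
struct Expr {
  enum Kind { Constant, Trunc, Ext, Add, Sub, Mul, And, Or, Xor, Select, Other } K;
  std::vector<Expr *> Ops;
};

bool canDemote(const Expr *E) {
  switch (E->K) {
  case Expr::Constant:
  case Expr::Trunc:
  case Expr::Ext:
    return true;                       // always demotable
  case Expr::Add:
  case Expr::Sub:
  case Expr::Mul:
  case Expr::And:
  case Expr::Or:
  case Expr::Xor:
    return canDemote(E->Ops[0]) && canDemote(E->Ops[1]);
  case Expr::Select:                   // condition stays; demote both arms
    return canDemote(E->Ops[1]) && canDemote(E->Ops[2]);
  default:
    return false;                      // loads, shifts, divisions, ... give up
  }
}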
3967
3968void BoUpSLP::computeMinimumValueSizes() {
3969 // If there are no external uses, the expression tree must be rooted by a
3970 // store. We can't demote in-memory values, so there is nothing to do here.
3971 if (ExternalUses.empty())
3972 return;
3973
3974 // We only attempt to truncate integer expressions.
3975 auto &TreeRoot = VectorizableTree[0].Scalars;
3976 auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
3977 if (!TreeRootIT)
3978 return;
3979
3980 // If the expression is not rooted by a store, these roots should have
3981 // external uses. We will rely on InstCombine to rewrite the expression in
3982 // the narrower type. However, InstCombine only rewrites single-use values.
3983 // This means that if a tree entry other than a root is used externally, it
3984 // must have multiple uses and InstCombine will not rewrite it. The code
3985 // below ensures that only the roots are used externally.
3986 SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
3987 for (auto &EU : ExternalUses)
3988 if (!Expr.erase(EU.Scalar))
3989 return;
3990 if (!Expr.empty())
3991 return;
3992
3993 // Collect the scalar values of the vectorizable expression. We will use this
3994 // context to determine which values can be demoted. If we see a truncation,
3995 // we mark it as seeding another demotion.
3996 for (auto &Entry : VectorizableTree)
3997 Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
3998
3999 // Ensure the roots of the vectorizable tree don't form a cycle. They must
4000 // have a single external user that is not in the vectorizable tree.
4001 for (auto *Root : TreeRoot)
4002 if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
4003 return;
4004
4005 // Conservatively determine if we can actually truncate the roots of the
4006 // expression. Collect the values that can be demoted in ToDemote and
4007 // additional roots that require investigating in Roots.
4008 SmallVector<Value *, 32> ToDemote;
4009 SmallVector<Value *, 4> Roots;
4010 for (auto *Root : TreeRoot)
4011 if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
4012 return;
4013
4014 // The maximum bit width required to represent all the values that can be
4015 // demoted without loss of precision. It would be safe to truncate the roots
4016 // of the expression to this width.
4017 auto MaxBitWidth = 8u;
4018
4019 // We first check if all the bits of the roots are demanded. If they're not,
4020 // we can truncate the roots to this narrower type.
4021 for (auto *Root : TreeRoot) {
4022 auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
4023 MaxBitWidth = std::max<unsigned>(
4024 Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
4025 }
4026
4027 // True if the roots can be zero-extended back to their original type, rather
4028 // than sign-extended. We know that if the leading bits are not demanded, we
4029 // can safely zero-extend. So we initialize IsKnownPositive to True.
4030 bool IsKnownPositive = true;
4031
4032 // If all the bits of the roots are demanded, we can try a little harder to
4033 // compute a narrower type. This can happen, for example, if the roots are
4034 // getelementptr indices. InstCombine promotes these indices to the pointer
4035 // width. Thus, all their bits are technically demanded even though the
4036 // address computation might be vectorized in a smaller type.
4037 //
4038 // We start by looking at each entry that can be demoted. We compute the
4039 // maximum bit width required to store the scalar by using ValueTracking to
4040 // compute the number of high-order bits we can truncate.
4041 if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
4042 MaxBitWidth = 8u;
4043
4044 // Determine if the sign bit of all the roots is known to be zero. If not,
4045 // IsKnownPositive is set to False.
4046 IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
4047 KnownBits Known = computeKnownBits(R, *DL);
4048 return Known.isNonNegative();
4049 });
4050
4051 // Determine the maximum number of bits required to store the scalar
4052 // values.
4053 for (auto *Scalar : ToDemote) {
4054 auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
4055 auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
4056 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
4057 }
4058
4059 // If we can't prove that the sign bit is zero, we must add one to the
4060 // maximum bit width to account for the unknown sign bit. This preserves
4061 // the existing sign bit so we can safely sign-extend the root back to the
4062 // original type. Otherwise, if we know the sign bit is zero, we will
4063 // zero-extend the root instead.
4064 //
4065 // FIXME: This is somewhat suboptimal, as there will be cases where adding
4066 // one to the maximum bit width will yield a larger-than-necessary
4067 // type. In general, we need to add an extra bit only if we can't
4068 // prove that the upper bit of the original type is equal to the
4069 // upper bit of the proposed smaller type. If these two bits are the
4070 // same (either zero or one) we know that sign-extending from the
4071 // smaller type will result in the same value. Here, since we can't
4072 // yet prove this, we are just making the proposed smaller type
4073 // larger to ensure correctness.
4074 if (!IsKnownPositive)
4075 ++MaxBitWidth;
4076 }
4077
4078 // Round MaxBitWidth up to the next power-of-two.
4079 if (!isPowerOf2_64(MaxBitWidth))
4080 MaxBitWidth = NextPowerOf2(MaxBitWidth);
4081
4082 // If the maximum bit width we compute is less than the width of the roots'
4083 // type, we can proceed with the narrowing. Otherwise, do nothing.
4084 if (MaxBitWidth >= TreeRootIT->getBitWidth())
4085 return;
4086
4087 // If we can truncate the root, we must collect additional values that might
4088 // be demoted as a result. That is, those seeded by truncations we will
4089 // modify.
4090 while (!Roots.empty())
4091 collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
4092
4093 // Finally, map the values we can demote to the maximum bit width we computed.
4094 for (auto *Scalar : ToDemote)
4095 MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
4096}
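The width computation above boils down to: take the largest number of significant bits over the values being demoted, add one bit when the sign cannot be proven zero, round up to a power of two, and bail out if that is no narrower than the original type. A small arithmetic sketch, not LLVM code: pickNarrowWidth and SignificantBits are invented names for the per-value bit counts obtained from the demanded-bits or sign-bit analysis.

#include <algorithm>
#include <vector>

// Round a bit width up to the next power of two, starting at the same 8-bit
// floor that MaxBitWidth uses above.
unsigned roundUpBitWidth(unsigned Bits) {
  unsigned W = 8;
  while (W < Bits)
    W *= 2;
  return W;
}

// Sketch: returns the narrowed element width, or 0 if no narrowing is possible.
unsigned pickNarrowWidth(const std::vector<unsigned> &SignificantBits,
                         bool KnownPositive, unsigned OriginalWidth) {
  unsigned MaxBitWidth = 8;
  for (unsigned B : SignificantBits)
    MaxBitWidth = std::max(MaxBitWidth, B);
  if (!KnownPositive)
    ++MaxBitWidth;                       // keep room for the unknown sign bit
  MaxBitWidth = roundUpBitWidth(MaxBitWidth);
  return MaxBitWidth < OriginalWidth ? MaxBitWidth : 0;
}
// E.g. {12, 9} significant bits, sign unknown, i32 roots:
// max = 12, +1 = 13, rounded = 16, and 16 < 32, so the tree runs in i16.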
4097
4098namespace {
4099
4100/// The SLPVectorizer Pass.
4101struct SLPVectorizer : public FunctionPass {
4102 SLPVectorizerPass Impl;
4103
4104 /// Pass identification, replacement for typeid
4105 static char ID;
4106
4107 explicit SLPVectorizer() : FunctionPass(ID) {
4108 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
4109 }
4110
4111 bool doInitialization(Module &M) override {
4112 return false;
4113 }
4114
4115 bool runOnFunction(Function &F) override {
4116 if (skipFunction(F))
4117 return false;
4118
4119 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
4120 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
4121 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
4122 auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
4123 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
4124 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
4125 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
4126 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
4127 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
4128 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
4129
4130 return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
4131 }
4132
4133 void getAnalysisUsage(AnalysisUsage &AU) const override {
4134 FunctionPass::getAnalysisUsage(AU);
4135 AU.addRequired<AssumptionCacheTracker>();
4136 AU.addRequired<ScalarEvolutionWrapperPass>();
4137 AU.addRequired<AAResultsWrapperPass>();
4138 AU.addRequired<TargetTransformInfoWrapperPass>();
4139 AU.addRequired<LoopInfoWrapperPass>();
4140 AU.addRequired<DominatorTreeWrapperPass>();
4141 AU.addRequired<DemandedBitsWrapperPass>();
4142 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
4143 AU.addPreserved<LoopInfoWrapperPass>();
4144 AU.addPreserved<DominatorTreeWrapperPass>();
4145 AU.addPreserved<AAResultsWrapperPass>();
4146 AU.addPreserved<GlobalsAAWrapperPass>();
4147 AU.setPreservesCFG();
4148 }
4149};
4150
4151} // end anonymous namespace
4152
4153PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
4154 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
4155 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
4156 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
4157 auto *AA = &AM.getResult<AAManager>(F);
4158 auto *LI = &AM.getResult<LoopAnalysis>(F);
4159 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
4160 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
4161 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
4162 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
4163
4164 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
1. Calling 'SLPVectorizerPass::runImpl'
4165 if (!Changed)
4166 return PreservedAnalyses::all();
4167
4168 PreservedAnalyses PA;
4169 PA.preserveSet<CFGAnalyses>();
4170 PA.preserve<AAManager>();
4171 PA.preserve<GlobalsAA>();
4172 return PA;
4173}
4174
4175bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
4176 TargetTransformInfo *TTI_,
4177 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
4178 LoopInfo *LI_, DominatorTree *DT_,
4179 AssumptionCache *AC_, DemandedBits *DB_,
4180 OptimizationRemarkEmitter *ORE_) {
4181 SE = SE_;
4182 TTI = TTI_;
4183 TLI = TLI_;
4184 AA = AA_;
4185 LI = LI_;
4186 DT = DT_;
4187 AC = AC_;
4188 DB = DB_;
4189 DL = &F.getParent()->getDataLayout();
4190
4191 Stores.clear();
4192 GEPs.clear();
4193 bool Changed = false;
4194
4195 // If the target claims to have no vector registers don't attempt
4196 // vectorization.
4197 if (!TTI->getNumberOfRegisters(true))
2. Assuming the condition is false
3. Taking false branch
4198 return false;
4199
4200 // Don't vectorize when the attribute NoImplicitFloat is used.
4201 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
4. Assuming the condition is false
5. Taking false branch
4202 return false;
4203
4204 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
4205
4206 // Use the bottom up slp vectorizer to construct chains that start with
4207 // store instructions.
4208 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
4209
4210 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
4211 // delete instructions.
4212
4213 // Scan the blocks in the function in post order.
4214 for (auto BB : post_order(&F.getEntryBlock())) {
4215 collectSeedInstructions(BB);
4216
4217 // Vectorize trees that end at stores.
4218 if (!Stores.empty()) {
6. Assuming the condition is false
7. Taking false branch
10. Assuming the condition is false
11. Taking false branch
14. Assuming the condition is false
15. Taking false branch
4219 DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
4220              << " underlying objects.\n");
4221 Changed |= vectorizeStoreChains(R);
4222 }
4223
4224 // Vectorize trees that end at reductions.
4225 Changed |= vectorizeChainsInBlock(BB, R);
4226
4227 // Vectorize the index computations of getelementptr instructions. This
4228 // is primarily intended to catch gather-like idioms ending at
4229 // non-consecutive loads.
4230 if (!GEPs.empty()) {
8. Assuming the condition is false
9. Taking false branch
12. Assuming the condition is false
13. Taking false branch
16. Assuming the condition is false
17. Taking false branch
4231 DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
4232              << " underlying objects.\n");
4233 Changed |= vectorizeGEPIndices(BB, R);
4234 }
4235 }
4236
4237 if (Changed) {
18. Assuming 'Changed' is not equal to 0
19. Taking true branch
4238 R.optimizeGatherSequence();
20. Calling 'BoUpSLP::optimizeGatherSequence'
4239 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
4240 DEBUG(verifyFunction(F));
4241 }
4242 return Changed;
4243}
4244
4245/// \brief Check that the Values in the slice in VL array are still existent in
4246/// the WeakTrackingVH array.
4247/// Vectorization of part of the VL array may cause later values in the VL array
4248/// to become invalid. We track when this has happened in the WeakTrackingVH
4249/// array.
4250static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
4251 ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
4252 unsigned SliceSize) {
4253 VL = VL.slice(SliceBegin, SliceSize);
4254 VH = VH.slice(SliceBegin, SliceSize);
4255 return !std::equal(VL.begin(), VL.end(), VH.begin());
4256}
4257
4258bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
4259 unsigned VecRegSize) {
4260 unsigned ChainLen = Chain.size();
4261 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
4262              << "\n");
4263 unsigned Sz = R.getVectorElementSize(Chain[0]);
4264 unsigned VF = VecRegSize / Sz;
4265
4266 if (!isPowerOf2_32(Sz) || VF < 2)
4267 return false;
4268
4269 // Keep track of values that were deleted by vectorizing in the loop below.
4270 SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
4271
4272 bool Changed = false;
4273 // Look for profitable vectorizable trees at all offsets, starting at zero.
4274 for (unsigned i = 0, e = ChainLen; i < e; ++i) {
4275 if (i + VF > e)
4276 break;
4277
4278 // Check that a previous iteration of this loop did not delete the Value.
4279 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
4280 continue;
4281
4282 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
4283              << "\n");
4284 ArrayRef<Value *> Operands = Chain.slice(i, VF);
4285
4286 R.buildTree(Operands);
4287 if (R.isTreeTinyAndNotFullyVectorizable())
4288 continue;
4289
4290 R.computeMinimumValueSizes();
4291
4292 int Cost = R.getTreeCost();
4293
4294 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
4295 if (Cost < -SLPCostThreshold) {
4296 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
4297
4298 using namespace ore;
4299
4300 R.getORE()->emit(OptimizationRemark(SV_NAME"slp-vectorizer", "StoresVectorized",
4301 cast<StoreInst>(Chain[i]))
4302 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
4303 << " and with tree size "
4304 << NV("TreeSize", R.getTreeSize()));
4305
4306 R.vectorizeTree();
4307
4308 // Move to the next bundle.
4309 i += VF - 1;
4310 Changed = true;
4311 }
4312 }
4313
4314 return Changed;
4315}
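The loop above slides a window of VF consecutive stores over the chain, where VF = VecRegSize / Sz, and skips the tail once fewer than VF stores remain. A tiny standalone illustration with made-up numbers, not the pass itself:

#include <cstdio>

// Sketch: with a 128-bit register and 32-bit stored values, VF = 128 / 32 = 4,
// so a chain of 10 stores is probed at offsets 0..6; a successful
// vectorization additionally jumps ahead by VF - 1 (not modeled here).
int main() {
  unsigned VecRegSize = 128, Sz = 32, ChainLen = 10;
  unsigned VF = VecRegSize / Sz;
  for (unsigned i = 0; i + VF <= ChainLen; ++i)
    std::printf("try stores [%u, %u)\n", i, i + VF);
  return 0;
}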
4316
4317bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
4318 BoUpSLP &R) {
4319 SetVector<StoreInst *> Heads;
4320 SmallDenseSet<StoreInst *> Tails;
4321 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
4322
4323 // We may run into multiple chains that merge into a single chain. We mark the
4324 // stores that we vectorized so that we don't visit the same store twice.
4325 BoUpSLP::ValueSet VectorizedStores;
4326 bool Changed = false;
4327
4328 // Do a quadratic search on all of the given stores in reverse order and find
4329 // all of the pairs of stores that follow each other.
4330 SmallVector<unsigned, 16> IndexQueue;
4331 unsigned E = Stores.size();
4332 IndexQueue.resize(E - 1);
4333 for (unsigned I = E; I > 0; --I) {
4334 unsigned Idx = I - 1;
4335 // If a store has multiple consecutive store candidates, search Stores
4336 // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
4337 // This is because pairing with the immediately succeeding or preceding
4338 // candidate usually creates the best chance to find an SLP vectorization opportunity.
4339 unsigned Offset = 1;
4340 unsigned Cnt = 0;
4341 for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
4342 if (Idx >= Offset) {
4343 IndexQueue[Cnt] = Idx - Offset;
4344 ++Cnt;
4345 }
4346 if (Idx + Offset < E) {
4347 IndexQueue[Cnt] = Idx + Offset;
4348 ++Cnt;
4349 }
4350 }
4351
4352 for (auto K : IndexQueue) {
4353 if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) {
4354 Tails.insert(Stores[Idx]);
4355 Heads.insert(Stores[K]);
4356 ConsecutiveChain[Stores[K]] = Stores[Idx];
4357 break;
4358 }
4359 }
4360 }
4361
4362 // For stores that start but don't end a link in the chain:
4363 for (auto *SI : llvm::reverse(Heads)) {
4364 if (Tails.count(SI))
4365 continue;
4366
4367 // We found a store instr that starts a chain. Now follow the chain and try
4368 // to vectorize it.
4369 BoUpSLP::ValueList Operands;
4370 StoreInst *I = SI;
4371 // Collect the chain into a list.
4372 while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
4373 Operands.push_back(I);
4374 // Move to the next value in the chain.
4375 I = ConsecutiveChain[I];
4376 }
4377
4378 // FIXME: Is division-by-2 the correct step? Should we assert that the
4379 // register size is a power-of-2?
4380 for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
4381 Size /= 2) {
4382 if (vectorizeStoreChain(Operands, R, Size)) {
4383 // Mark the vectorized stores so that we don't vectorize them again.
4384 VectorizedStores.insert(Operands.begin(), Operands.end());
4385 Changed = true;
4386 break;
4387 }
4388 }
4389 }
4390
4391 return Changed;
4392}
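The IndexQueue described in the comment above visits candidates closest to Stores[Idx] first, alternating below and above it. A standalone illustration of just that ordering; searchOrder is an invented helper, not part of the pass.

#include <cstdio>
#include <vector>

// Sketch: candidate visit order when looking for a store consecutive with
// Stores[Idx]: Idx-1, Idx+1, Idx-2, Idx+2, ... clamped to the range [0, E).
std::vector<unsigned> searchOrder(unsigned Idx, unsigned E) {
  std::vector<unsigned> Order;
  for (unsigned Offset = 1; Offset < E; ++Offset) {
    if (Idx >= Offset)
      Order.push_back(Idx - Offset);
    if (Idx + Offset < E)
      Order.push_back(Idx + Offset);
  }
  return Order;
}

int main() {
  for (unsigned K : searchOrder(/*Idx=*/3, /*E=*/6))
    std::printf("%u ", K);   // prints: 2 4 1 5 0
  std::printf("\n");
  return 0;
}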
4393
4394void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
4395 // Initialize the collections. We will make a single pass over the block.
4396 Stores.clear();
4397 GEPs.clear();
4398
4399 // Visit the store and getelementptr instructions in BB and organize them in
4400 // Stores and GEPs according to the underlying objects of their pointer
4401 // operands.
4402 for (Instruction &I : *BB) {
4403 // Ignore store instructions that are volatile or have a pointer operand
4404 // that doesn't point to a scalar type.
4405 if (auto *SI = dyn_cast<StoreInst>(&I)) {
4406 if (!SI->isSimple())
4407 continue;
4408 if (!isValidElementType(SI->getValueOperand()->getType()))
4409 continue;
4410 Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
4411 }
4412
4413 // Ignore getelementptr instructions that have more than one index, a
4414 // constant index, or a pointer operand that doesn't point to a scalar
4415 // type.
4416 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
4417 auto Idx = GEP->idx_begin()->get();
4418 if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
4419 continue;
4420 if (!isValidElementType(Idx->getType()))
4421 continue;
4422 if (GEP->getType()->isVectorTy())
4423 continue;
4424 GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
4425 }
4426 }
4427}
4428
4429bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
4430 if (!A || !B)
4431 return false;
4432 Value *VL[] = { A, B };
4433 return tryToVectorizeList(VL, R, None, true);
4434}
4435
4436bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
4437 ArrayRef<Value *> BuildVector,
4438 bool AllowReorder) {
4439 if (VL.size() < 2)
4440 return false;
4441
4442 DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
4443              << ".\n");
4444
4445 // Check that all of the parts are scalar instructions of the same type.
4446 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
4447 if (!I0)
4448 return false;
4449
4450 unsigned Opcode0 = I0->getOpcode();
4451
4452 unsigned Sz = R.getVectorElementSize(I0);
4453 unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
4454 unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
4455 if (MaxVF < 2) {
4456 R.getORE()->emit([&]() {
4457 return OptimizationRemarkMissed(
4458 SV_NAME"slp-vectorizer", "SmallVF", I0)
4459 << "Cannot SLP vectorize list: vectorization factor "
4460 << "less than 2 is not supported";
4461 });
4462 return false;
4463 }
4464
4465 for (Value *V : VL) {
4466 Type *Ty = V->getType();
4467 if (!isValidElementType(Ty)) {
4468 // NOTE: the following will give user internal llvm type name, which may not be useful
4469 R.getORE()->emit([&]() {
4470 std::string type_str;
4471 llvm::raw_string_ostream rso(type_str);
4472 Ty->print(rso);
4473 return OptimizationRemarkMissed(
4474 SV_NAME"slp-vectorizer", "UnsupportedType", I0)
4475 << "Cannot SLP vectorize list: type "
4476 << rso.str() + " is unsupported by vectorizer";
4477 });
4478 return false;
4479 }
4480 Instruction *Inst = dyn_cast<Instruction>(V);
4481
4482 if (!Inst)
4483 return false;
4484 if (Inst->getOpcode() != Opcode0) {
4485 R.getORE()->emit([&]() {
4486 return OptimizationRemarkMissed(
4487 SV_NAME"slp-vectorizer", "InequableTypes", I0)
4488 << "Cannot SLP vectorize list: not all of the "
4489 << "parts of scalar instructions are of the same type: "
4490 << ore::NV("Instruction1Opcode", I0) << " and "
4491 << ore::NV("Instruction2Opcode", Inst);
4492 });
4493 return false;
4494 }
4495 }
4496
4497 bool Changed = false;
4498 bool CandidateFound = false;
4499 int MinCost = SLPCostThreshold;
4500
4501 // Keep track of values that were deleted by vectorizing in the loop below.
4502 SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
4503
4504 unsigned NextInst = 0, MaxInst = VL.size();
4505 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
4506 VF /= 2) {
4507 // No actual vectorization should happen, if number of parts is the same as
4508 // provided vectorization factor (i.e. the scalar type is used for vector
4509 // code during codegen).
4510 auto *VecTy = VectorType::get(VL[0]->getType(), VF);
4511 if (TTI->getNumberOfParts(VecTy) == VF)
4512 continue;
4513 for (unsigned I = NextInst; I < MaxInst; ++I) {
4514 unsigned OpsWidth = 0;
4515
4516 if (I + VF > MaxInst)
4517 OpsWidth = MaxInst - I;
4518 else
4519 OpsWidth = VF;
4520
4521 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
4522 break;
4523
4524 // Check that a previous iteration of this loop did not delete the Value.
4525 if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
4526 continue;
4527
4528 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
4529              << "\n");
4530 ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
4531
4532 ArrayRef<Value *> BuildVectorSlice;
4533 if (!BuildVector.empty())
4534 BuildVectorSlice = BuildVector.slice(I, OpsWidth);
4535
4536 R.buildTree(Ops, BuildVectorSlice);
4537 // TODO: check if we can allow reordering for more cases.
4538 if (AllowReorder && R.shouldReorder()) {
4539 // Conceptually, there is nothing actually preventing us from trying to
4540 // reorder a larger list. In fact, we do exactly this when vectorizing
4541 // reductions. However, at this point, we only expect to get here when
4542 // there are exactly two operations.
4543 assert(Ops.size() == 2);
4544 assert(BuildVectorSlice.empty());
4545 Value *ReorderedOps[] = {Ops[1], Ops[0]};
4546 R.buildTree(ReorderedOps, None);
4547 }
4548 if (R.isTreeTinyAndNotFullyVectorizable())
4549 continue;
4550
4551 R.computeMinimumValueSizes();
4552 int Cost = R.getTreeCost();
4553 CandidateFound = true;
4554 MinCost = std::min(MinCost, Cost);
4555
4556 if (Cost < -SLPCostThreshold) {
4557 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
4558 R.getORE()->emit(OptimizationRemark(SV_NAME"slp-vectorizer", "VectorizedList",
4559 cast<Instruction>(Ops[0]))
4560 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
4561 << " and with tree size "
4562 << ore::NV("TreeSize", R.getTreeSize()));
4563
4564 Value *VectorizedRoot = R.vectorizeTree();
4565
4566 // Reconstruct the build vector by extracting the vectorized root. This
4567 // way we handle the case where some elements of the vector are
4568 // undefined.
4569 // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
4570 if (!BuildVectorSlice.empty()) {
4571 // The insert point is the last build vector instruction. The
4572 // vectorized root will precede it. This guarantees that we get an
4573 // instruction. The vectorized tree could have been constant folded.
4574 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
4575 unsigned VecIdx = 0;
4576 for (auto &V : BuildVectorSlice) {
4577 IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
4578 ++BasicBlock::iterator(InsertAfter));
4579 Instruction *I = cast<Instruction>(V);
4580 assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
4581 Instruction *Extract =
4582 cast<Instruction>(Builder.CreateExtractElement(
4583 VectorizedRoot, Builder.getInt32(VecIdx++)));
4584 I->setOperand(1, Extract);
4585 I->moveAfter(Extract);
4586 InsertAfter = I;
4587 }
4588 }
4589 // Move to the next bundle.
4590 I += VF - 1;
4591 NextInst = I + 1;
4592 Changed = true;
4593 }
4594 }
4595 }
4596
4597 if (!Changed && CandidateFound) {
4598 R.getORE()->emit([&]() {
4599 return OptimizationRemarkMissed(
4600 SV_NAME"slp-vectorizer", "NotBeneficial", I0)
4601 << "List vectorization was possible but not beneficial with cost "
4602 << ore::NV("Cost", MinCost) << " >= "
4603 << ore::NV("Treshold", -SLPCostThreshold);
4604 });
4605 } else if (!Changed) {
4606 R.getORE()->emit([&]() {
4607 return OptimizationRemarkMissed(
4608 SV_NAME"slp-vectorizer", "NotPossible", I0)
4609 << "Cannot SLP vectorize list: vectorization was impossible"
4610 << " with available vectorization factors";
4611 });
4612 }
4613 return Changed;
4614}
4615
4616bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
4617 if (!I)
4618 return false;
4619
4620 if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
4621 return false;
4622
4623 Value *P = I->getParent();
4624
4625 // Vectorize in current basic block only.
4626 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
4627 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
4628 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
4629 return false;
4630
4631 // Try to vectorize V.
4632 if (tryToVectorizePair(Op0, Op1, R))
4633 return true;
4634
4635 auto *A = dyn_cast<BinaryOperator>(Op0);
4636 auto *B = dyn_cast<BinaryOperator>(Op1);
4637 // Try to skip B.
4638 if (B && B->hasOneUse()) {
4639 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
4640 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
4641 if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
4642 return true;
4643 if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
4644 return true;
4645 }
4646
4647 // Try to skip A.
4648 if (A && A->hasOneUse()) {
4649 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
4650 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
4651 if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
4652 return true;
4653 if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
4654 return true;
4655 }
4656 return false;
4657}
4658
4659/// \brief Generate a shuffle mask to be used in a reduction tree.
4660///
4661/// \param VecLen The length of the vector to be reduced.
4662/// \param NumEltsToRdx The number of elements that should be reduced in the
4663/// vector.
4664/// \param IsPairwise Whether the reduction is a pairwise or splitting
4665/// reduction. A pairwise reduction will generate a mask of
4666/// <0,2,...> or <1,3,..> while a splitting reduction will generate
4667/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
4668/// \param IsLeft True will generate a mask of even elements, odd otherwise.
4669static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
4670 bool IsPairwise, bool IsLeft,
4671 IRBuilder<> &Builder) {
4672 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
4673
4674 SmallVector<Constant *, 32> ShuffleMask(
4675 VecLen, UndefValue::get(Builder.getInt32Ty()));
4676
4677 if (IsPairwise)
4678 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
4679 for (unsigned i = 0; i != NumEltsToRdx; ++i)
4680 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
4681 else
4682 // Move the upper half of the vector to the lower half.
4683 for (unsigned i = 0; i != NumEltsToRdx; ++i)
4684 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
4685
4686 return ConstantVector::get(ShuffleMask);
4687}
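The doc comment above is easiest to check with concrete numbers. This standalone sketch uses plain integers with -1 standing in for undef; rdxShuffleMask is an invented name, not the LLVM helper, but it reproduces the same index pattern.

#include <vector>

// Sketch: for VecLen = 4 and NumEltsToRdx = 2 this yields
//   pairwise, left   -> {0, 2, -1, -1}   i.e. <0, 2, undef, undef>
//   pairwise, right  -> {1, 3, -1, -1}
//   splitting        -> {2, 3, -1, -1}   (upper half moved to the lower half)
std::vector<int> rdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                bool IsPairwise, bool IsLeft) {
  std::vector<int> Mask(VecLen, -1);
  for (unsigned i = 0; i != NumEltsToRdx; ++i)
    Mask[i] = IsPairwise ? int(2 * i + !IsLeft) : int(NumEltsToRdx + i);
  return Mask;
}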
4688
4689namespace {
4690
4691/// Model horizontal reductions.
4692///
4693/// A horizontal reduction is a tree of reduction operations (currently add and
4694/// fadd) that has operations that can be put into a vector as its leaf.
4695/// For example, this tree:
4696///
4697/// mul mul mul mul
4698/// \ / \ /
4699/// + +
4700/// \ /
4701/// +
4702/// This tree has "mul" as its reduced values and "+" as its reduction
4703/// operations. A reduction might be feeding into a store or a binary operation
4704/// feeding a phi.
4705/// ...
4706/// \ /
4707/// +
4708/// |
4709/// phi +=
4710///
4711/// Or:
4712/// ...
4713/// \ /
4714/// +
4715/// |
4716/// *p =
4717///
4718class HorizontalReduction {
4719 using ReductionOpsType = SmallVector<Value *, 16>;
4720 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
4721 ReductionOpsListType ReductionOps;
4722 SmallVector<Value *, 32> ReducedVals;
4723 // Use map vector to make stable output.
4724 MapVector<Instruction *, Value *> ExtraArgs;
4725
4726 /// Kind of the reduction data.
4727 enum ReductionKind {
4728 RK_None, /// Not a reduction.
4729 RK_Arithmetic, /// Binary reduction data.
4730 RK_Min, /// Minimum reduction data.
4731 RK_UMin, /// Unsigned minimum reduction data.
4732 RK_Max, /// Maximum reduction data.
4733 RK_UMax, /// Unsigned maximum reduction data.
4734 };
4735
4736 /// Contains info about operation, like its opcode, left and right operands.
4737 class OperationData {
4738 /// Opcode of the instruction.
4739 unsigned Opcode = 0;
4740
4741 /// Left operand of the reduction operation.
4742 Value *LHS = nullptr;
4743
4744 /// Right operand of the reduction operation.
4745 Value *RHS = nullptr;
4746
4747 /// Kind of the reduction operation.
4748 ReductionKind Kind = RK_None;
4749
4751 /// True if a floating-point min/max reduction has no NaNs.
4751 bool NoNaN = false;
4752
4753 /// Checks if the reduction operation can be vectorized.
4754 bool isVectorizable() const {
4755 return LHS && RHS &&
4756 // We currently only support adds && min/max reductions.
4757 ((Kind == RK_Arithmetic &&
4758 (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
4759 ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
4760 (Kind == RK_Min || Kind == RK_Max)) ||
4761 (Opcode == Instruction::ICmp &&
4762 (Kind == RK_UMin || Kind == RK_UMax)));
4763 }
4764
4765 /// Creates reduction operation with the current opcode.
4766 Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
4767 assert(isVectorizable() &&
4768        "Expected add|fadd or min/max reduction operation.");
4769 Value *Cmp;
4770 switch (Kind) {
4771 case RK_Arithmetic:
4772 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
4773 Name);
4774 case RK_Min:
4775 Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
4776 : Builder.CreateFCmpOLT(LHS, RHS);
4777 break;
4778 case RK_Max:
4779 Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
4780 : Builder.CreateFCmpOGT(LHS, RHS);
4781 break;
4782 case RK_UMin:
4783 assert(Opcode == Instruction::ICmp && "Expected integer types.");
4784 Cmp = Builder.CreateICmpULT(LHS, RHS);
4785 break;
4786 case RK_UMax:
4787 assert(Opcode == Instruction::ICmp && "Expected integer types.");
4788 Cmp = Builder.CreateICmpUGT(LHS, RHS);
4789 break;
4790 case RK_None:
4791 llvm_unreachable("Unknown reduction operation.")::llvm::llvm_unreachable_internal("Unknown reduction operation."
, "/build/llvm-toolchain-snapshot-6.0~svn318693/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 4791)
;
4792 }
4793 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
4794 }
4795
4796 public:
4797 explicit OperationData() = default;
4798
4799 /// Constructor for reduced values. They are identified by opcode only and
4800 /// don't have associated LHS/RHS values.
4801 explicit OperationData(Value *V) {
4802 if (auto *I = dyn_cast<Instruction>(V))
4803 Opcode = I->getOpcode();
4804 }
4805
4806 /// Constructor for reduction operations with opcode and its left and
4807 /// right operands.
4808 OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
4809 bool NoNaN = false)
4810 : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
4811 assert(Kind != RK_None && "One of the reduction operations is expected.");
4812 }
4813
4814 explicit operator bool() const { return Opcode; }
4815
4816 /// Get the index of the first operand.
4817 unsigned getFirstOperandIndex() const {
4818 assert(!!*this && "The opcode is not set.");
4819 switch (Kind) {
4820 case RK_Min:
4821 case RK_UMin:
4822 case RK_Max:
4823 case RK_UMax:
4824 return 1;
4825 case RK_Arithmetic:
4826 case RK_None:
4827 break;
4828 }
4829 return 0;
4830 }
4831
4832 /// Total number of operands in the reduction operation.
4833 unsigned getNumberOfOperands() const {
4834 assert(Kind != RK_None && !!*this && LHS && RHS &&
4835 "Expected reduction operation.");
4836 switch (Kind) {
4837 case RK_Arithmetic:
4838 return 2;
4839 case RK_Min:
4840 case RK_UMin:
4841 case RK_Max:
4842 case RK_UMax:
4843 return 3;
4844 case RK_None:
4845 break;
4846 }
4847 llvm_unreachable("Reduction kind is not set");
4848 }
4849
4850 /// Checks if the operation has the same parent as \p P.
4851 bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
4852 assert(Kind != RK_None && !!*this && LHS && RHS &&
4853 "Expected reduction operation.");
4854 if (!IsRedOp)
4855 return I->getParent() == P;
4856 switch (Kind) {
4857 case RK_Arithmetic:
4858 // Arithmetic reduction operation must be used once only.
4859 return I->getParent() == P;
4860 case RK_Min:
4861 case RK_UMin:
4862 case RK_Max:
4863 case RK_UMax: {
4864 // SelectInst must be used twice while the condition op must have single
4865 // use only.
4866 auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
4867 return I->getParent() == P && Cmp && Cmp->getParent() == P;
4868 }
4869 case RK_None:
4870 break;
4871 }
4872 llvm_unreachable("Reduction kind is not set");
4873 }
4874 /// Expected number of uses for reduction operations/reduced values.
4875 bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
4876 assert(Kind != RK_None && !!*this && LHS && RHS &&
4877 "Expected reduction operation.");
4878 switch (Kind) {
4879 case RK_Arithmetic:
4880 return I->hasOneUse();
4881 case RK_Min:
4882 case RK_UMin:
4883 case RK_Max:
4884 case RK_UMax:
4885 return I->hasNUses(2) &&
4886 (!IsReductionOp ||
4887 cast<SelectInst>(I)->getCondition()->hasOneUse());
4888 case RK_None:
4889 break;
4890 }
4891 llvm_unreachable("Reduction kind is not set");
4892 }
4893
4894 /// Initializes the list of reduction operations.
4895 void initReductionOps(ReductionOpsListType &ReductionOps) {
4896 assert(Kind != RK_None && !!*this && LHS && RHS &&
4897 "Expected reduction operation.");
4898 switch (Kind) {
4899 case RK_Arithmetic:
4900 ReductionOps.assign(1, ReductionOpsType());
4901 break;
4902 case RK_Min:
4903 case RK_UMin:
4904 case RK_Max:
4905 case RK_UMax:
4906 ReductionOps.assign(2, ReductionOpsType());
4907 break;
4908 case RK_None:
4909 llvm_unreachable("Reduction kind is not set");
4910 }
4911 }
4912 /// Add all reduction operations for the reduction instruction \p I.
4913 void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
4914 assert(Kind != RK_None && !!*this && LHS && RHS &&
4915 "Expected reduction operation.");
4916 switch (Kind) {
4917 case RK_Arithmetic:
4918 ReductionOps[0].emplace_back(I);
4919 break;
4920 case RK_Min:
4921 case RK_UMin:
4922 case RK_Max:
4923 case RK_UMax:
4924 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
4925 ReductionOps[1].emplace_back(I);
4926 break;
4927 case RK_None:
4928 llvm_unreachable("Reduction kind is not set");
4929 }
4930 }
4931
4932 /// Checks if instruction is associative and can be vectorized.
4933 bool isAssociative(Instruction *I) const {
4934 assert(Kind != RK_None && *this && LHS && RHS &&
4935 "Expected reduction operation.");
4936 switch (Kind) {
4937 case RK_Arithmetic:
4938 return I->isAssociative();
4939 case RK_Min:
4940 case RK_Max:
4941 return Opcode == Instruction::ICmp ||
4942 cast<Instruction>(I->getOperand(0))->isFast();
4943 case RK_UMin:
4944 case RK_UMax:
4945 assert(Opcode == Instruction::ICmp &&
4946 "Only integer compare operation is expected.");
4947 return true;
4948 case RK_None:
4949 break;
4950 }
4951 llvm_unreachable("Reduction kind is not set");
4952 }
4953
4954 /// Checks if the reduction operation can be vectorized.
4955 bool isVectorizable(Instruction *I) const {
4956 return isVectorizable() && isAssociative(I);
4957 }
4958
4959 /// Checks if two operation data are both a reduction op or both a reduced
4960 /// value.
4961 bool operator==(const OperationData &OD) {
4962 assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
4963 "One of the comparing operations is incorrect.");
4964 return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
4965 }
4966 bool operator!=(const OperationData &OD) { return !(*this == OD); }
4967 void clear() {
4968 Opcode = 0;
4969 LHS = nullptr;
4970 RHS = nullptr;
4971 Kind = RK_None;
4972 NoNaN = false;
4973 }
4974
4975 /// Get the opcode of the reduction operation.
4976 unsigned getOpcode() const {
4977 assert(isVectorizable() && "Expected vectorizable operation.");
4978 return Opcode;
4979 }
4980
4981 /// Get kind of reduction data.
4982 ReductionKind getKind() const { return Kind; }
4983 Value *getLHS() const { return LHS; }
4984 Value *getRHS() const { return RHS; }
4985 Type *getConditionType() const {
4986 switch (Kind) {
4987 case RK_Arithmetic:
4988 return nullptr;
4989 case RK_Min:
4990 case RK_Max:
4991 case RK_UMin:
4992 case RK_UMax:
4993 return CmpInst::makeCmpResultType(LHS->getType());
4994 case RK_None:
4995 break;
4996 }
4997 llvm_unreachable("Reduction kind is not set");
4998 }
4999
5000 /// Creates reduction operation with the current opcode with the IR flags
5001 /// from \p ReductionOps.
5002 Value *createOp(IRBuilder<> &Builder, const Twine &Name,
5003 const ReductionOpsListType &ReductionOps) const {
5004 assert(isVectorizable() &&
5005 "Expected add|fadd or min/max reduction operation.");
5006 auto *Op = createOp(Builder, Name);
5007 switch (Kind) {
5008 case RK_Arithmetic:
5009 propagateIRFlags(Op, ReductionOps[0]);
5010 return Op;
5011 case RK_Min:
5012 case RK_Max:
5013 case RK_UMin:
5014 case RK_UMax:
5015 if (auto *SI = dyn_cast<SelectInst>(Op))
5016 propagateIRFlags(SI->getCondition(), ReductionOps[0]);
5017 propagateIRFlags(Op, ReductionOps[1]);
5018 return Op;
5019 case RK_None:
5020 break;
5021 }
5022 llvm_unreachable("Unknown reduction operation.");
5023 }
5024 /// Creates reduction operation with the current opcode with the IR flags
5025 /// from \p I.
5026 Value *createOp(IRBuilder<> &Builder, const Twine &Name,
5027 Instruction *I) const {
5028 assert(isVectorizable() &&
5029 "Expected add|fadd or min/max reduction operation.");
5030 auto *Op = createOp(Builder, Name);
5031 switch (Kind) {
5032 case RK_Arithmetic:
5033 propagateIRFlags(Op, I);
5034 return Op;
5035 case RK_Min:
5036 case RK_Max:
5037 case RK_UMin:
5038 case RK_UMax:
5039 if (auto *SI = dyn_cast<SelectInst>(Op)) {
5040 propagateIRFlags(SI->getCondition(),
5041 cast<SelectInst>(I)->getCondition());
5042 }
5043 propagateIRFlags(Op, I);
5044 return Op;
5045 case RK_None:
5046 break;
5047 }
5048 llvm_unreachable("Unknown reduction operation.");
5049 }
5050
5051 TargetTransformInfo::ReductionFlags getFlags() const {
5052 TargetTransformInfo::ReductionFlags Flags;
5053 Flags.NoNaN = NoNaN;
5054 switch (Kind) {
5055 case RK_Arithmetic:
5056 break;
5057 case RK_Min:
5058 Flags.IsSigned = Opcode == Instruction::ICmp;
5059 Flags.IsMaxOp = false;
5060 break;
5061 case RK_Max:
5062 Flags.IsSigned = Opcode == Instruction::ICmp;
5063 Flags.IsMaxOp = true;
5064 break;
5065 case RK_UMin:
5066 Flags.IsSigned = false;
5067 Flags.IsMaxOp = false;
5068 break;
5069 case RK_UMax:
5070 Flags.IsSigned = false;
5071 Flags.IsMaxOp = true;
5072 break;
5073 case RK_None:
5074 llvm_unreachable("Reduction kind is not set");
5075 }
5076 return Flags;
5077 }
5078 };
5079
5080 Instruction *ReductionRoot = nullptr;
5081
5082 /// The operation data of the reduction operation.
5083 OperationData ReductionData;
5084
5085 /// The operation data of the values we perform a reduction on.
5086 OperationData ReducedValueData;
5087
5088 /// Should we model this reduction as a pairwise reduction tree or a tree that
5089 /// splits the vector in halves and adds those halves.
5090 bool IsPairwiseReduction = false;
5091
5092 /// Checks if the ParentStackElem.first should be marked as a reduction
5093 /// operation with an extra argument or as an extra argument itself.
5094 void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
5095 Value *ExtraArg) {
5096 if (ExtraArgs.count(ParentStackElem.first)) {
5097 ExtraArgs[ParentStackElem.first] = nullptr;
5098 // We ran into something like:
5099 // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
5100 // The whole ParentStackElem.first should be considered as an extra value
5101 // in this case.
5102 // Do not perform analysis of remaining operands of ParentStackElem.first
5103 // instruction; this whole instruction is an extra argument.
5104 ParentStackElem.second = ParentStackElem.first->getNumOperands();
5105 } else {
5106 // We ran into something like:
5107 // ParentStackElem.first += ... + ExtraArg + ...
5108 ExtraArgs[ParentStackElem.first] = ExtraArg;
5109 }
5110 }
5111
5112 static OperationData getOperationData(Value *V) {
5113 if (!V)
5114 return OperationData();
5115
5116 Value *LHS;
5117 Value *RHS;
5118 if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
5119 return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
5120 RK_Arithmetic);
5121 }
5122 if (auto *Select = dyn_cast<SelectInst>(V)) {
5123 // Look for a min/max pattern.
5124 if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
5125 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
5126 } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
5127 return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
5128 } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
5129 m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
5130 return OperationData(
5131 Instruction::FCmp, LHS, RHS, RK_Min,
5132 cast<Instruction>(Select->getCondition())->hasNoNaNs());
5133 } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
5134 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
5135 } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
5136 return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
5137 } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
5138 m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
5139 return OperationData(
5140 Instruction::FCmp, LHS, RHS, RK_Max,
5141 cast<Instruction>(Select->getCondition())->hasNoNaNs());
5142 }
5143 }
5144 return OperationData(V);
5145 }
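// Illustrative sketch (not from the original source): getOperationData()
// above classifies a compare-plus-select idiom such as
//   %cmp = icmp sgt i32 %a, %b
//   %max = select i1 %cmp, i32 %a, i32 %b
// as an RK_Max operation with Opcode == Instruction::ICmp, while a plain
// binary operator such as
//   %sum = fadd fast float %x, %y
// is classified as RK_Arithmetic.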
5146
5147public:
5148 HorizontalReduction() = default;
5149
5150 /// \brief Try to find a reduction tree.
5151 bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
5152 assert((!Phi || is_contained(Phi->operands(), B)) &&
5153 "The phi needs to use the binary operator");
5154
5155 ReductionData = getOperationData(B);
5156
5157 // We could have an initial reduction that is not an add.
5158 // r *= v1 + v2 + v3 + v4
5159 // In such a case start looking for a tree rooted in the first '+'.
5160 if (Phi) {
5161 if (ReductionData.getLHS() == Phi) {
5162 Phi = nullptr;
5163 B = dyn_cast<Instruction>(ReductionData.getRHS());
5164 ReductionData = getOperationData(B);
5165 } else if (ReductionData.getRHS() == Phi) {
5166 Phi = nullptr;
5167 B = dyn_cast<Instruction>(ReductionData.getLHS());
5168 ReductionData = getOperationData(B);
5169 }
5170 }
5171
5172 if (!ReductionData.isVectorizable(B))
5173 return false;
5174
5175 Type *Ty = B->getType();
5176 if (!isValidElementType(Ty))
5177 return false;
5178
5179 ReducedValueData.clear();
5180 ReductionRoot = B;
5181
5182 // Post order traverse the reduction tree starting at B. We only handle true
5183 // trees containing only binary operators.
5184 SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
5185 Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
5186 ReductionData.initReductionOps(ReductionOps);
5187 while (!Stack.empty()) {
5188 Instruction *TreeN = Stack.back().first;
5189 unsigned EdgeToVist = Stack.back().second++;
5190 OperationData OpData = getOperationData(TreeN);
5191 bool IsReducedValue = OpData != ReductionData;
5192
5193 // Postorder visit.
5194 if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
5195 if (IsReducedValue)
5196 ReducedVals.push_back(TreeN);
5197 else {
5198 auto I = ExtraArgs.find(TreeN);
5199 if (I != ExtraArgs.end() && !I->second) {
5200 // Check if TreeN is an extra argument of its parent operation.
5201 if (Stack.size() <= 1) {
5202 // TreeN can't be an extra argument as it is a root reduction
5203 // operation.
5204 return false;
5205 }
5206 // Yes, TreeN is an extra argument, do not add it to a list of
5207 // reduction operations.
5208 // Stack[Stack.size() - 2] always points to the parent operation.
5209 markExtraArg(Stack[Stack.size() - 2], TreeN);
5210 ExtraArgs.erase(TreeN);
5211 } else
5212 ReductionData.addReductionOps(TreeN, ReductionOps);
5213 }
5214 // Retract.
5215 Stack.pop_back();
5216 continue;
5217 }
5218
5219 // Visit left or right.
5220 Value *NextV = TreeN->getOperand(EdgeToVist);
5221 if (NextV != Phi) {
5222 auto *I = dyn_cast<Instruction>(NextV);
5223 OpData = getOperationData(I);
5224 // Continue analysis if the next operand is a reduction operation or
5225 // (possibly) a reduced value. If the reduced value opcode is not set,
5226 // the first operation encountered that differs from the reduction
5227 // operation is treated as the reduced value class.
5228 if (I && (!ReducedValueData || OpData == ReducedValueData ||
5229 OpData == ReductionData)) {
5230 const bool IsReductionOperation = OpData == ReductionData;
5231 // Only handle trees in the current basic block.
5232 if (!ReductionData.hasSameParent(I, B->getParent(),
5233 IsReductionOperation)) {
5234 // I is an extra argument for TreeN (its parent operation).
5235 markExtraArg(Stack.back(), I);
5236 continue;
5237 }
5238
5239 // Each tree node needs to have a minimal number of uses, except for the
5240 // ultimate reduction.
5241 if (!ReductionData.hasRequiredNumberOfUses(I,
5242 OpData == ReductionData) &&
5243 I != B) {
5244 // I is an extra argument for TreeN (its parent operation).
5245 markExtraArg(Stack.back(), I);
5246 continue;
5247 }
5248
5249 if (IsReductionOperation) {
5250 // We need to be able to reassociate the reduction operations.
5251 if (!OpData.isAssociative(I)) {
5252 // I is an extra argument for TreeN (its parent operation).
5253 markExtraArg(Stack.back(), I);
5254 continue;
5255 }
5256 } else if (ReducedValueData &&
5257 ReducedValueData != OpData) {
5258 // Make sure that the opcodes of the operations that we are going to
5259 // reduce match.
5260 // I is an extra argument for TreeN (its parent operation).
5261 markExtraArg(Stack.back(), I);
5262 continue;
5263 } else if (!ReducedValueData)
5264 ReducedValueData = OpData;
5265
5266 Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
5267 continue;
5268 }
5269 }
5270 // NextV is an extra argument for TreeN (its parent operation).
5271 markExtraArg(Stack.back(), NextV);
5272 }
5273 return true;
5274 }
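// Illustrative sketch (assumed example, not from the original source): for a
// single-block, single-use chain such as
//   %p    = phi float [ 0.0, %entry ], [ %add3, %loop ]
//   %add0 = fadd fast float %p, %v0
//   %add1 = fadd fast float %add0, %v1
//   %add2 = fadd fast float %add1, %v2
//   %add3 = fadd fast float %add2, %v3
// matchAssociativeReduction(%p, %add3) records %add0..%add3 as ReductionOps,
// %v0..%v3 as ReducedVals, and the phi as an extra argument of %add0.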
5275
5276 /// \brief Attempt to vectorize the tree found by
5277 /// matchAssociativeReduction.
5278 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
5279 if (ReducedVals.empty())
5280 return false;
5281
5282 // If there is a sufficient number of reduction values, reduce
5283 // to a nearby power-of-2. We can safely generate oversized
5284 // vectors and rely on the backend to split them to legal sizes.
5285 unsigned NumReducedVals = ReducedVals.size();
5286 if (NumReducedVals < 4)
5287 return false;
5288
5289 unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
5290
5291 Value *VectorizedTree = nullptr;
5292 IRBuilder<> Builder(ReductionRoot);
5293 FastMathFlags Unsafe;
5294 Unsafe.setFast();
5295 Builder.setFastMathFlags(Unsafe);
5296 unsigned i = 0;
5297
5298 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
5299 // The same extra argument may be used several times, so log each attempt
5300 // to use it.
5301 for (auto &Pair : ExtraArgs)
5302 ExternallyUsedValues[Pair.second].push_back(Pair.first);
5303 SmallVector<Value *, 16> IgnoreList;
5304 for (auto &V : ReductionOps)
5305 IgnoreList.append(V.begin(), V.end());
5306 while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
5307 auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
5308 V.buildTree(VL, ExternallyUsedValues, IgnoreList);
5309 if (V.shouldReorder()) {
5310 SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
5311 V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
5312 }
5313 if (V.isTreeTinyAndNotFullyVectorizable())
5314 break;
5315
5316 V.computeMinimumValueSizes();
5317
5318 // Estimate cost.
5319 int Cost =
5320 V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
5321 if (Cost >= -SLPCostThreshold) {
5322 V.getORE()->emit([&]() {
5323 return OptimizationRemarkMissed(
5324 SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
5325 << "Vectorizing horizontal reduction is possible "
5326 << "but not beneficial with cost "
5327 << ore::NV("Cost", Cost) << " and threshold "
5328 << ore::NV("Threshold", -SLPCostThreshold);
5329 });
5330 break;
5331 }
5332
5333 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
5334 << ". (HorRdx)\n");
5335 V.getORE()->emit([&]() {
5336 return OptimizationRemark(
5337 SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
5338 << "Vectorized horizontal reduction with cost "
5339 << ore::NV("Cost", Cost) << " and with tree size "
5340 << ore::NV("TreeSize", V.getTreeSize());
5341 });
5342
5343 // Vectorize a tree.
5344 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
5345 Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
5346
5347 // Emit a reduction.
5348 Value *ReducedSubTree =
5349 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
5350 if (VectorizedTree) {
5351 Builder.SetCurrentDebugLocation(Loc);
5352 OperationData VectReductionData(ReductionData.getOpcode(),
5353 VectorizedTree, ReducedSubTree,
5354 ReductionData.getKind());
5355 VectorizedTree =
5356 VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
5357 } else
5358 VectorizedTree = ReducedSubTree;
5359 i += ReduxWidth;
5360 ReduxWidth = PowerOf2Floor(NumReducedVals - i);
5361 }
5362
5363 if (VectorizedTree) {
5364 // Finish the reduction.
5365 for (; i < NumReducedVals; ++i) {
5366 auto *I = cast<Instruction>(ReducedVals[i]);
5367 Builder.SetCurrentDebugLocation(I->getDebugLoc());
5368 OperationData VectReductionData(ReductionData.getOpcode(),
5369 VectorizedTree, I,
5370 ReductionData.getKind());
5371 VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
5372 }
5373 for (auto &Pair : ExternallyUsedValues) {
5374 assert(!Pair.second.empty() &&
5375 "At least one DebugLoc must be inserted");
5376 // Add each externally used value to the final reduction.
5377 for (auto *I : Pair.second) {
5378 Builder.SetCurrentDebugLocation(I->getDebugLoc());
5379 OperationData VectReductionData(ReductionData.getOpcode(),
5380 VectorizedTree, Pair.first,
5381 ReductionData.getKind());
5382 VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
5383 }
5384 }
5385 // Update users.
5386 ReductionRoot->replaceAllUsesWith(VectorizedTree);
5387 }
5388 return VectorizedTree != nullptr;
5389 }
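// Illustrative sketch (assumed numbers, not from the original source): with
// NumReducedVals == 7 the loop above vectorizes ReducedVals[0..3] at
// ReduxWidth == 4, the next width PowerOf2Floor(3) == 2 fails the
// "ReduxWidth > 2" test, and the remaining three values are folded into the
// result one at a time by the scalar tail loop guarded by "if (VectorizedTree)".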
5390
5391 unsigned numReductionValues() const {
5392 return ReducedVals.size();
5393 }
5394
5395private:
5396 /// \brief Calculate the cost of a reduction.
5397 int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
5398 unsigned ReduxWidth) {
5399 Type *ScalarTy = FirstReducedVal->getType();
5400 Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
5401
5402 int PairwiseRdxCost;
5403 int SplittingRdxCost;
5404 switch (ReductionData.getKind()) {
5405 case RK_Arithmetic:
5406 PairwiseRdxCost =
5407 TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
5408 /*IsPairwiseForm=*/true);
5409 SplittingRdxCost =
5410 TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
5411 /*IsPairwiseForm=*/false);
5412 break;
5413 case RK_Min:
5414 case RK_Max:
5415 case RK_UMin:
5416 case RK_UMax: {
5417 Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
5418 bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
5419 ReductionData.getKind() == RK_UMax;
5420 PairwiseRdxCost =
5421 TTI->getMinMaxReductionCost(VecTy, VecCondTy,
5422 /*IsPairwiseForm=*/true, IsUnsigned);
5423 SplittingRdxCost =
5424 TTI->getMinMaxReductionCost(VecTy, VecCondTy,
5425 /*IsPairwiseForm=*/false, IsUnsigned);
5426 break;
5427 }
5428 case RK_None:
5429 llvm_unreachable("Expected arithmetic or min/max reduction operation");
5430 }
5431
5432 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
5433 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
5434
5435 int ScalarReduxCost;
5436 switch (ReductionData.getKind()) {
5437 case RK_Arithmetic:
5438 ScalarReduxCost =
5439 TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
5440 break;
5441 case RK_Min:
5442 case RK_Max:
5443 case RK_UMin:
5444 case RK_UMax:
5445 ScalarReduxCost =
5446 TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
5447 TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
5448 CmpInst::makeCmpResultType(ScalarTy));
5449 break;
5450 case RK_None:
5451 llvm_unreachable("Expected arithmetic or min/max reduction operation");
5452 }
5453 ScalarReduxCost *= (ReduxWidth - 1);
5454
5455 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
5456 << " for reduction that starts with " << *FirstReducedVal
5457 << " (It is a "
5458 << (IsPairwiseReduction ? "pairwise" : "splitting")
5459 << " reduction)\n");
5460
5461 return VecReduxCost - ScalarReduxCost;
5462 }
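// Illustrative sketch (assumed numbers, not from the original source): for an
// integer add reduction with ReduxWidth == 8, the scalar baseline computed
// above is 7 * getArithmeticInstrCost(Add, i32); the function returns the
// cheaper of the pairwise and splitting vector reduction costs minus that
// baseline, so a negative result means the vector form is expected to win.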
5463
5464 /// \brief Emit a horizontal reduction of the vectorized value.
5465 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
5466 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
5467 assert(VectorizedValue && "Need to have a vectorized tree node");
5468 assert(isPowerOf2_32(ReduxWidth) &&
5469 "We only handle power-of-two reductions for now");
5470
5471 if (!IsPairwiseReduction)
5472 return createSimpleTargetReduction(
5473 Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
5474 ReductionData.getFlags(), ReductionOps.back());
5475
5476 Value *TmpVec = VectorizedValue;
5477 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
5478 Value *LeftMask =
5479 createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
5480 Value *RightMask =
5481 createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
5482
5483 Value *LeftShuf = Builder.CreateShuffleVector(
5484 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
5485 Value *RightShuf = Builder.CreateShuffleVector(
5486 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
5487 "rdx.shuf.r");
5488 OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
5489 RightShuf, ReductionData.getKind());
5490 TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
5491 }
5492
5493 // The result is in the first element of the vector.
5494 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
5495 }
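// Illustrative sketch (assumed ReduxWidth == 4, not from the original source):
// the pairwise loop above first emits roughly
//   %l = shufflevector <4 x float> %t, <4 x float> undef,
//                      <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
//   %r = shufflevector <4 x float> %t, <4 x float> undef,
//                      <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
//   %t1 = fadd <4 x float> %l, %r
// then repeats with lanes {0} and {1} of %t1, and finally extracts lane 0 as
// the scalar reduction result.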
5496};
5497
5498} // end anonymous namespace
5499
5500/// \brief Recognize construction of vectors like
5501/// %ra = insertelement <4 x float> undef, float %s0, i32 0
5502/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
5503/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
5504/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
5505/// starting from the last insertelement instruction.
5506///
5507/// Returns true if it matches
5508static bool findBuildVector(InsertElementInst *LastInsertElem,
5509 SmallVectorImpl<Value *> &BuildVector,
5510 SmallVectorImpl<Value *> &BuildVectorOpds) {
5511 Value *V = nullptr;
5512 do {
5513 BuildVector.push_back(LastInsertElem);
5514 BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
5515 V = LastInsertElem->getOperand(0);
5516 if (isa<UndefValue>(V))
5517 break;
5518 LastInsertElem = dyn_cast<InsertElementInst>(V);
5519 if (!LastInsertElem || !LastInsertElem->hasOneUse())
5520 return false;
5521 } while (true);
5522 std::reverse(BuildVector.begin(), BuildVector.end());
5523 std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
5524 return true;
5525}
5526
5527/// \brief Like findBuildVector, but looks for construction of aggregate.
5528///
5529/// \return true if it matches.
5530static bool findBuildAggregate(InsertValueInst *IV,
5531 SmallVectorImpl<Value *> &BuildVector,
5532 SmallVectorImpl<Value *> &BuildVectorOpds) {
5533 Value *V;
5534 do {
5535 BuildVector.push_back(IV);
5536 BuildVectorOpds.push_back(IV->getInsertedValueOperand());
5537 V = IV->getAggregateOperand();
5538 if (isa<UndefValue>(V))
5539 break;
5540 IV = dyn_cast<InsertValueInst>(V);
5541 if (!IV || !IV->hasOneUse())
5542 return false;
5543 } while (true);
5544 std::reverse(BuildVector.begin(), BuildVector.end());
5545 std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
5546 return true;
5547}
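// Illustrative sketch (not from the original source), mirroring the
// insertelement example above but for an aggregate:
//   %ra = insertvalue { float, float } undef, float %s0, 0
//   %rb = insertvalue { float, float } %ra, float %s1, 1
// findBuildAggregate(%rb, ...) collects {%ra, %rb} as BuildVector and
// {%s0, %s1} as BuildVectorOpds.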
5548
5549static bool PhiTypeSorterFunc(Value *V, Value *V2) {
5550 return V->getType() < V2->getType();
5551}
5552
5553/// \brief Try and get a reduction value from a phi node.
5554///
5555/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
5556/// if they come from either \p ParentBB or a containing loop latch.
5557///
5558/// \returns A candidate reduction value if possible, or \code nullptr \endcode
5559/// if not possible.
5560static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
5561 BasicBlock *ParentBB, LoopInfo *LI) {
5562 // There are situations where the reduction value is not dominated by the
5563 // reduction phi. Vectorizing such cases has been reported to cause
5564 // miscompiles. See PR25787.
5565 auto DominatedReduxValue = [&](Value *R) {
5566 return (
5567 dyn_cast<Instruction>(R) &&
5568 DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
5569 };
5570
5571 Value *Rdx = nullptr;
5572
5573 // Return the incoming value if it comes from the same BB as the phi node.
5574 if (P->getIncomingBlock(0) == ParentBB) {
5575 Rdx = P->getIncomingValue(0);
5576 } else if (P->getIncomingBlock(1) == ParentBB) {
5577 Rdx = P->getIncomingValue(1);
5578 }
5579
5580 if (Rdx && DominatedReduxValue(Rdx))
5581 return Rdx;
5582
5583 // Otherwise, check whether we have a loop latch to look at.
5584 Loop *BBL = LI->getLoopFor(ParentBB);
5585 if (!BBL)
5586 return nullptr;
5587 BasicBlock *BBLatch = BBL->getLoopLatch();
5588 if (!BBLatch)
5589 return nullptr;
5590
5591 // There is a loop latch, return the incoming value if it comes from
5592 // that. This reduction pattern occasionally turns up.
5593 if (P->getIncomingBlock(0) == BBLatch) {
5594 Rdx = P->getIncomingValue(0);
5595 } else if (P->getIncomingBlock(1) == BBLatch) {
5596 Rdx = P->getIncomingValue(1);
5597 }
5598
5599 if (Rdx && DominatedReduxValue(Rdx))
5600 return Rdx;
5601
5602 return nullptr;
5603}
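// Illustrative sketch (not from the original source): for a loop such as
//   header:
//     %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//     ...
//   latch:
//     %sum.next = add i32 %sum, %x
//     br label %header
// getReductionValue(DT, %sum, %header, LI) returns %sum.next, the incoming
// value that comes from the loop latch and is dominated by the phi's block.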
5604
5605/// Attempt to reduce a horizontal reduction.
5606/// If it is legal to match a horizontal reduction feeding the phi node \a P
5607/// with reduction operators \a Root (or one of its operands) in a basic block
5608/// \a BB, then check if it can be done. If a horizontal reduction is not found
5609/// and the root instruction is a binary operation, vectorization of the
5610/// operands is attempted.
5611/// \returns true if a horizontal reduction was matched and reduced, or if the
5612/// operands of one of the binary instructions were vectorized.
5613/// \returns false if a horizontal reduction was not matched (or not possible)
5614/// and no vectorization of any binary operation feeding the \a Root
5615/// instruction was performed.
5616static bool tryToVectorizeHorReductionOrInstOperands(
5617 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
5618 TargetTransformInfo *TTI,
5619 const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
5620 if (!ShouldVectorizeHor)
5621 return false;
5622
5623 if (!Root)
5624 return false;
5625
5626 if (Root->getParent() != BB || isa<PHINode>(Root))
5627 return false;
5628 // Start the analysis from the Root instruction. If a horizontal reduction is
5629 // found, try to vectorize it. If it is not a horizontal reduction, or
5630 // vectorization is not possible or not effective, and the currently analyzed
5631 // instruction is a binary operation, try to vectorize the operands using a
5632 // pre-order DFS traversal. If the operands were not vectorized, repeat the
5633 // same procedure, considering each operand as a possible root of a
5634 // horizontal reduction.
5635 // Interrupt the process once the Root instruction itself has been vectorized
5636 // or all sub-trees no deeper than RecursionMaxDepth have been analyzed/vectorized.
5637 SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
5638 SmallSet<Value *, 8> VisitedInstrs;
5639 bool Res = false;
5640 while (!Stack.empty()) {
5641 Value *V;
5642 unsigned Level;
5643 std::tie(V, Level) = Stack.pop_back_val();
5644 if (!V)
5645 continue;
5646 auto *Inst = dyn_cast<Instruction>(V);
5647 if (!Inst)
5648 continue;
5649 auto *BI = dyn_cast<BinaryOperator>(Inst);
5650 auto *SI = dyn_cast<SelectInst>(Inst);
5651 if (BI || SI) {
5652 HorizontalReduction HorRdx;
5653 if (HorRdx.matchAssociativeReduction(P, Inst)) {
5654 if (HorRdx.tryToReduce(R, TTI)) {
5655 Res = true;
5656 // Set P to nullptr to avoid re-analysis of phi node in
5657 // matchAssociativeReduction function unless this is the root node.
5658 P = nullptr;
5659 continue;
5660 }
5661 }
5662 if (P && BI) {
5663 Inst = dyn_cast<Instruction>(BI->getOperand(0));
5664 if (Inst == P)
5665 Inst = dyn_cast<Instruction>(BI->getOperand(1));
5666 if (!Inst) {
5667 // Set P to nullptr to avoid re-analysis of phi node in
5668 // matchAssociativeReduction function unless this is the root node.
5669 P = nullptr;
5670 continue;
5671 }
5672 }
5673 }
5674 // Set P to nullptr to avoid re-analysis of phi node in
5675 // matchAssociativeReduction function unless this is the root node.
5676 P = nullptr;
5677 if (Vectorize(Inst, R)) {
5678 Res = true;
5679 continue;
5680 }
5681
5682 // Try to vectorize operands.
5683 // Continue analysis for the instruction from the same basic block only to
5684 // save compile time.
5685 if (++Level < RecursionMaxDepth)
5686 for (auto *Op : Inst->operand_values())
5687 if (VisitedInstrs.insert(Op).second)
5688 if (auto *I = dyn_cast<Instruction>(Op))
5689 if (!isa<PHINode>(I) && I->getParent() == BB)
5690 Stack.emplace_back(Op, Level);
5691 }
5692 return Res;
5693}
5694
5695bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
5696 BasicBlock *BB, BoUpSLP &R,
5697 TargetTransformInfo *TTI) {
5698 if (!V)
5699 return false;
5700 auto *I = dyn_cast<Instruction>(V);
5701 if (!I)
5702 return false;
5703
5704 if (!isa<BinaryOperator>(I))
5705 P = nullptr;
5706 // Try to match and vectorize a horizontal reduction.
5707 auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
5708 return tryToVectorize(I, R);
5709 };
5710 return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
5711 ExtraVectorization);
5712}
5713
5714bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
5715 BasicBlock *BB, BoUpSLP &R) {
5716 const DataLayout &DL = BB->getModule()->getDataLayout();
5717 if (!R.canMapToVector(IVI->getType(), DL))
5718 return false;
5719
5720 SmallVector<Value *, 16> BuildVector;
5721 SmallVector<Value *, 16> BuildVectorOpds;
5722 if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
5723 return false;
5724
5725 DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
5726 return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false);
5727}
5728
5729bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
5730 BasicBlock *BB, BoUpSLP &R) {
5731 SmallVector<Value *, 16> BuildVector;
5732 SmallVector<Value *, 16> BuildVectorOpds;
5733 if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
5734 return false;
5735
5736 // Vectorize starting with the build vector operands ignoring the BuildVector
5737 // instructions for the purpose of scheduling and user extraction.
5738 return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
5739}
5740
5741bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
5742 BoUpSLP &R) {
5743 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
5744 return true;
5745
5746 bool OpsChanged = false;
5747 for (int Idx = 0; Idx < 2; ++Idx) {
5748 OpsChanged |=
5749 vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
5750 }
5751 return OpsChanged;
5752}
5753
5754bool SLPVectorizerPass::vectorizeSimpleInstructions(
5755 SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
5756 bool OpsChanged = false;
5757 for (auto &VH : reverse(Instructions)) {
5758 auto *I = dyn_cast_or_null<Instruction>(VH);
5759 if (!I)
5760 continue;
5761 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
5762 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
5763 else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
5764 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
5765 else if (auto *CI = dyn_cast<CmpInst>(I))
5766 OpsChanged |= vectorizeCmpInst(CI, BB, R);
5767 }
5768 Instructions.clear();
5769 return OpsChanged;
5770}
5771
5772bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
5773 bool Changed = false;
5774 SmallVector<Value *, 4> Incoming;
5775 SmallSet<Value *, 16> VisitedInstrs;
5776
5777 bool HaveVectorizedPhiNodes = true;
5778 while (HaveVectorizedPhiNodes) {
5779 HaveVectorizedPhiNodes = false;
5780
5781 // Collect the incoming values from the PHIs.
5782 Incoming.clear();
5783 for (Instruction &I : *BB) {
5784 PHINode *P = dyn_cast<PHINode>(&I);
5785 if (!P)
5786 break;
5787
5788 if (!VisitedInstrs.count(P))
5789 Incoming.push_back(P);
5790 }
5791
5792 // Sort by type.
5793 std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
5794
5795 // Try to vectorize elements based on their type.
5796 for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
5797 E = Incoming.end();
5798 IncIt != E;) {
5799
5800 // Look for the next elements with the same type.
5801 SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
5802 while (SameTypeIt != E &&
5803 (*SameTypeIt)->getType() == (*IncIt)->getType()) {
5804 VisitedInstrs.insert(*SameTypeIt);
5805 ++SameTypeIt;
5806 }
5807
5808 // Try to vectorize them.
5809 unsigned NumElts = (SameTypeIt - IncIt);
5810 DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
5811 // The order in which the phi nodes appear in the program does not matter.
5812 // So allow tryToVectorizeList to reorder them if it is beneficial. This
5813 // is done when there are exactly two elements since tryToVectorizeList
5814 // asserts that there are only two values when AllowReorder is true.
5815 bool AllowReorder = NumElts == 2;
5816 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
5817 None, AllowReorder)) {
5818 // Success: start over because instructions might have been changed.
5819 HaveVectorizedPhiNodes = true;
5820 Changed = true;
5821 break;
5822 }
5823
5824 // Start over at the next instruction of a different type (or the end).
5825 IncIt = SameTypeIt;
5826 }
5827 }
5828
5829 VisitedInstrs.clear();
5830
5831 SmallVector<WeakVH, 8> PostProcessInstructions;
5832 SmallDenseSet<Instruction *, 4> KeyNodes;
5833 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
5834 // We may go through BB multiple times, so skip instructions we have already checked.
5835 if (!VisitedInstrs.insert(&*it).second) {
5836 if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
5837 vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
5838 // We would like to start over since some instructions are deleted
5839 // and the iterator may become invalid.
5840 Changed = true;
5841 it = BB->begin();
5842 e = BB->end();
5843 }
5844 continue;
5845 }
5846
5847 if (isa<DbgInfoIntrinsic>(it))
5848 continue;
5849
5850 // Try to vectorize reductions that use PHINodes.
5851 if (PHINode *P = dyn_cast<PHINode>(it)) {
5852 // Check that the PHI is a reduction PHI.
5853 if (P->getNumIncomingValues() != 2)
5854 return Changed;
5855
5856 // Try to match and vectorize a horizontal reduction.
5857 if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
5858 TTI)) {
5859 Changed = true;
5860 it = BB->begin();
5861 e = BB->end();
5862 continue;
5863 }
5864 continue;
5865 }
5866
5867 // Ran into an instruction without users, such as a terminator, a store, or a
5868 // function call with an ignored return value. Ignore unused instructions
5869 // (based on the instruction type, except for CallInst and InvokeInst).
5870 if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
5871 isa<InvokeInst>(it))) {
5872 KeyNodes.insert(&*it);
5873 bool OpsChanged = false;
5874 if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
5875 for (auto *V : it->operand_values()) {
5876 // Try to match and vectorize a horizontal reduction.
5877 OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
5878 }
5879 }
5880 // Start vectorization of post-process list of instructions from the
5881 // top-tree instructions to try to vectorize as many instructions as
5882 // possible.
5883 OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
5884 if (OpsChanged) {
5885 // We would like to start over since some instructions are deleted
5886 // and the iterator may become invalid.
5887 Changed = true;
5888 it = BB->begin();
5889 e = BB->end();
5890 continue;
5891 }
5892 }
5893
5894 if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
5895 isa<InsertValueInst>(it))
5896 PostProcessInstructions.push_back(&*it);
5897
5898 }
5899
5900 return Changed;
5901}
5902
5903bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
5904 auto Changed = false;
5905 for (auto &Entry : GEPs) {
5906 // If the getelementptr list has fewer than two elements, there's nothing
5907 // to do.
5908 if (Entry.second.size() < 2)
5909 continue;
5910
5911 DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
5912 << Entry.second.size() << ".\n");
5913
5914 // We process the getelementptr list in chunks of 16 (like we do for
5915 // stores) to minimize compile-time.
5916 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
5917 auto Len = std::min<unsigned>(BE - BI, 16);
5918 auto GEPList = makeArrayRef(&Entry.second[BI], Len);
5919
5920 // Initialize a set of candidate getelementptrs. Note that we use a
5921 // SetVector here to preserve program order. If the index computations
5922 // are vectorizable and begin with loads, we want to minimize the chance
5923 // of having to reorder them later.
5924 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
5925
5926 // Some of the candidates may have already been vectorized after we
5927 // initially collected them. If so, the WeakTrackingVHs will have
5928 // nullified the values, so remove them from the set of
5929 // candidates.
5930 Candidates.remove(nullptr);
5931
5932 // Remove from the set of candidates all pairs of getelementptrs with
5933 // constant differences. Such getelementptrs are likely not good
5934 // candidates for vectorization in a bottom-up phase since one can be
5935 // computed from the other. We also ensure all candidate getelementptr
5936 // indices are unique.
5937 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
5938 auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
5939 if (!Candidates.count(GEPI))
5940 continue;
5941 auto *SCEVI = SE->getSCEV(GEPList[I]);
5942 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
5943 auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
5944 auto *SCEVJ = SE->getSCEV(GEPList[J]);
5945 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
5946 Candidates.remove(GEPList[I]);
5947 Candidates.remove(GEPList[J]);
5948 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
5949 Candidates.remove(GEPList[J]);
5950 }
5951 }
5952 }
5953
5954 // We break out of the above computation as soon as we know there are
5955 // fewer than two candidates remaining.
5956 if (Candidates.size() < 2)
5957 continue;
5958
5959 // Add the single, non-constant index of each candidate to the bundle. We
5960 // ensured the indices met these constraints when we originally collected
5961 // the getelementptrs.
5962 SmallVector<Value *, 16> Bundle(Candidates.size());
5963 auto BundleIndex = 0u;
5964 for (auto *V : Candidates) {
5965 auto *GEP = cast<GetElementPtrInst>(V);
5966 auto *GEPIdx = GEP->idx_begin()->get();
5967 assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
5968 Bundle[BundleIndex++] = GEPIdx;
5969 }
5970
5971 // Try and vectorize the indices. We are currently only interested in
5972 // gather-like cases of the form:
5973 //
5974 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
5975 //
5976 // where the loads of "a", the loads of "b", and the subtractions can be
5977 // performed in parallel. It's likely that detecting this pattern in a
5978 // bottom-up phase will be simpler and less costly than building a
5979 // full-blown top-down phase beginning at the consecutive loads.
5980 Changed |= tryToVectorizeList(Bundle, R);
5981 }
5982 }
5983 return Changed;
5984}
5985
5986bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
5987 bool Changed = false;
5988 // Attempt to sort and vectorize each of the store-groups.
5989 for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
5990 ++it) {
5991 if (it->second.size() < 2)
5992 continue;
5993
5994 DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
5995 << it->second.size() << ".\n");
5996
5997 // Process the stores in chunks of 16.
5998 // TODO: The limit of 16 inhibits greater vectorization factors.
5999 // For example, AVX2 supports v32i8. Increasing this limit, however,
6000 // may cause a significant compile-time increase.
6001 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
6002 unsigned Len = std::min<unsigned>(CE - CI, 16);
6003 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
6004 }
6005 }
6006 return Changed;
6007}
6008
6009char SLPVectorizer::ID = 0;
6010
6011static const char lv_name[] = "SLP Vectorizer";
6012
6013INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
6014INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6015INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6016INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6017INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6018INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
6019INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6020INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6021INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
6022
6023Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }