Bug Summary

File: lib/Transforms/Vectorize/SLPVectorizer.cpp
Warning: line 3642, column 22
Called C++ object pointer is null
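This diagnostic means that, on at least one path the analyzer explored, a member function is called through a pointer that can be null at the call site — typically the result of a failed dyn_cast<> or of a lookup that returns nullptr. Below is a minimal sketch of the warning class using hypothetical names; it is not the code actually flagged at line 3642:

    #include "llvm/IR/Instruction.h"
    #include "llvm/Support/Casting.h"

    // Hypothetical helper illustrating the diagnostic only.
    static unsigned opcodeOf(llvm::Value *V) {
      // dyn_cast yields nullptr when V is not an Instruction.
      auto *I = llvm::dyn_cast<llvm::Instruction>(V);
      // Dereferencing without a null check is what the analyzer reports as
      // "Called C++ object pointer is null".
      return I->getOpcode();
    }

The usual fix is to guard the call, for example "if (auto *I = llvm::dyn_cast<llvm::Instruction>(V)) return I->getOpcode();" and handle the null case explicitly.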

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SLPVectorizer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-eagerly-assume -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-7/lib/clang/7.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-7~svn325874/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-7~svn325874/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-7~svn325874/build-llvm/include -I /build/llvm-toolchain-snapshot-7~svn325874/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0/backward -internal-isystem /usr/include/clang/7.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-7/lib/clang/7.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-7~svn325874/build-llvm/lib/Transforms/Vectorize -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-02-23-163436-368-1 -x c++ /build/llvm-toolchain-snapshot-7~svn325874/lib/Transforms/Vectorize/SLPVectorizer.cpp

/build/llvm-toolchain-snapshot-7~svn325874/lib/Transforms/Vectorize/SLPVectorizer.cpp

1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
11// stores that can be put together into vector-stores. Next, it attempts to
12// construct vectorizable tree using the use-def chains. If a profitable tree
13// was found, the SLP vectorizer performs vectorization on the tree.
14//
15// The pass is inspired by the work described in the paper:
16// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
17//
18//===----------------------------------------------------------------------===//
19
20#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/DenseMap.h"
23#include "llvm/ADT/DenseSet.h"
24#include "llvm/ADT/MapVector.h"
25#include "llvm/ADT/None.h"
26#include "llvm/ADT/Optional.h"
27#include "llvm/ADT/PostOrderIterator.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/SmallPtrSet.h"
31#include "llvm/ADT/SmallSet.h"
32#include "llvm/ADT/SmallVector.h"
33#include "llvm/ADT/Statistic.h"
34#include "llvm/ADT/iterator.h"
35#include "llvm/ADT/iterator_range.h"
36#include "llvm/Analysis/AliasAnalysis.h"
37#include "llvm/Analysis/CodeMetrics.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/LoopAccessAnalysis.h"
41#include "llvm/Analysis/LoopInfo.h"
42#include "llvm/Analysis/MemoryLocation.h"
43#include "llvm/Analysis/OptimizationRemarkEmitter.h"
44#include "llvm/Analysis/ScalarEvolution.h"
45#include "llvm/Analysis/ScalarEvolutionExpressions.h"
46#include "llvm/Analysis/TargetLibraryInfo.h"
47#include "llvm/Analysis/TargetTransformInfo.h"
48#include "llvm/Analysis/ValueTracking.h"
49#include "llvm/Analysis/VectorUtils.h"
50#include "llvm/IR/Attributes.h"
51#include "llvm/IR/BasicBlock.h"
52#include "llvm/IR/Constant.h"
53#include "llvm/IR/Constants.h"
54#include "llvm/IR/DataLayout.h"
55#include "llvm/IR/DebugLoc.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
62#include "llvm/IR/Instructions.h"
63#include "llvm/IR/IntrinsicInst.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/NoFolder.h"
67#include "llvm/IR/Operator.h"
68#include "llvm/IR/PassManager.h"
69#include "llvm/IR/PatternMatch.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
75#include "llvm/IR/Verifier.h"
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/DOTGraphTraits.h"
81#include "llvm/Support/Debug.h"
82#include "llvm/Support/ErrorHandling.h"
83#include "llvm/Support/GraphWriter.h"
84#include "llvm/Support/KnownBits.h"
85#include "llvm/Support/MathExtras.h"
86#include "llvm/Support/raw_ostream.h"
87#include "llvm/Transforms/Utils/LoopUtils.h"
88#include "llvm/Transforms/Vectorize.h"
89#include <algorithm>
90#include <cassert>
91#include <cstdint>
92#include <iterator>
93#include <memory>
94#include <set>
95#include <string>
96#include <tuple>
97#include <utility>
98#include <vector>
99
100using namespace llvm;
101using namespace llvm::PatternMatch;
102using namespace slpvectorizer;
103
104#define SV_NAME "slp-vectorizer"
105#define DEBUG_TYPE "SLP"
106
107STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
108
109static cl::opt<int>
110 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
111 cl::desc("Only vectorize if you gain more than this "
112 "number "));
113
114static cl::opt<bool>
115ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
116 cl::desc("Attempt to vectorize horizontal reductions"));
117
118static cl::opt<bool> ShouldStartVectorizeHorAtStore(
119 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
120 cl::desc(
121 "Attempt to vectorize horizontal reductions feeding into a store"));
122
123static cl::opt<int>
124MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
125 cl::desc("Attempt to vectorize for this register size in bits"));
126
127/// Limits the size of scheduling regions in a block.
128/// It avoids long compile times for _very_ large blocks where vector
129/// instructions are spread over a wide range.
130/// This limit is way higher than needed by real-world functions.
131static cl::opt<int>
132ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
133 cl::desc("Limit the size of the SLP scheduling region per block"));
134
135static cl::opt<int> MinVectorRegSizeOption(
136 "slp-min-reg-size", cl::init(128), cl::Hidden,
137 cl::desc("Attempt to vectorize for this register size in bits"));
138
139static cl::opt<unsigned> RecursionMaxDepth(
140 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
141 cl::desc("Limit the recursion depth when building a vectorizable tree"));
142
143static cl::opt<unsigned> MinTreeSize(
144 "slp-min-tree-size", cl::init(3), cl::Hidden,
145 cl::desc("Only vectorize small trees if they are fully vectorizable"));
146
147static cl::opt<bool>
148 ViewSLPTree("view-slp-tree", cl::Hidden,
149 cl::desc("Display the SLP trees with Graphviz"));
150
151// Limit the number of alias checks. The limit is chosen so that
152// it has no negative effect on the llvm benchmarks.
153static const unsigned AliasedCheckLimit = 10;
154
155// Another limit for the alias checks: The maximum distance between load/store
156// instructions where alias checks are done.
157// This limit is useful for very large basic blocks.
158static const unsigned MaxMemDepDistance = 160;
159
160/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
161/// regions to be handled.
162static const int MinScheduleRegionSize = 16;
163
164/// \brief Predicate for the element types that the SLP vectorizer supports.
165///
166/// The most important things to filter here are types which are invalid in LLVM
167/// vectors. We also filter target specific types which have absolutely no
168/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
169/// avoids spending time checking the cost model and realizing that they will
170/// be inevitably scalarized.
171static bool isValidElementType(Type *Ty) {
172 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
173 !Ty->isPPC_FP128Ty();
174}
175
176/// \returns true if all of the instructions in \p VL are in the same block or
177/// false otherwise.
178static bool allSameBlock(ArrayRef<Value *> VL) {
179 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
180 if (!I0)
181 return false;
182 BasicBlock *BB = I0->getParent();
183 for (int i = 1, e = VL.size(); i < e; i++) {
184 Instruction *I = dyn_cast<Instruction>(VL[i]);
185 if (!I)
186 return false;
187
188 if (BB != I->getParent())
189 return false;
190 }
191 return true;
192}
193
194/// \returns True if all of the values in \p VL are constants.
195static bool allConstant(ArrayRef<Value *> VL) {
196 for (Value *i : VL)
197 if (!isa<Constant>(i))
198 return false;
199 return true;
200}
201
202/// \returns True if all of the values in \p VL are identical.
203static bool isSplat(ArrayRef<Value *> VL) {
204 for (unsigned i = 1, e = VL.size(); i < e; ++i)
205 if (VL[i] != VL[0])
206 return false;
207 return true;
208}
209
210/// Checks if the vector of instructions can be represented as a shuffle, like:
211/// %x0 = extractelement <4 x i8> %x, i32 0
212/// %x3 = extractelement <4 x i8> %x, i32 3
213/// %y1 = extractelement <4 x i8> %y, i32 1
214/// %y2 = extractelement <4 x i8> %y, i32 2
215/// %x0x0 = mul i8 %x0, %x0
216/// %x3x3 = mul i8 %x3, %x3
217/// %y1y1 = mul i8 %y1, %y1
218/// %y2y2 = mul i8 %y2, %y2
219/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
220/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
221/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
222/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
223/// ret <4 x i8> %ins4
224/// can be transformed into:
225/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
226/// i32 6>
227/// %2 = mul <4 x i8> %1, %1
228/// ret <4 x i8> %2
229/// We convert this initially to something like:
230/// %x0 = extractelement <4 x i8> %x, i32 0
231/// %x3 = extractelement <4 x i8> %x, i32 3
232/// %y1 = extractelement <4 x i8> %y, i32 1
233/// %y2 = extractelement <4 x i8> %y, i32 2
234/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
235/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
236/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
237/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
238/// %5 = mul <4 x i8> %4, %4
239/// %6 = extractelement <4 x i8> %5, i32 0
240/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
241/// %7 = extractelement <4 x i8> %5, i32 1
242/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
243/// %8 = extractelement <4 x i8> %5, i32 2
244/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
245/// %9 = extractelement <4 x i8> %5, i32 3
246/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
247/// ret <4 x i8> %ins4
248/// InstCombiner transforms this into a shuffle and vector mul
249static Optional<TargetTransformInfo::ShuffleKind>
250isShuffle(ArrayRef<Value *> VL) {
251 auto *EI0 = cast<ExtractElementInst>(VL[0]);
252 unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
253 Value *Vec1 = nullptr;
254 Value *Vec2 = nullptr;
255 enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
256 ShuffleMode CommonShuffleMode = Unknown;
257 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
258 auto *EI = cast<ExtractElementInst>(VL[I]);
259 auto *Vec = EI->getVectorOperand();
260 // All vector operands must have the same number of vector elements.
261 if (Vec->getType()->getVectorNumElements() != Size)
262 return None;
263 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
264 if (!Idx)
265 return None;
266 // Undefined behavior if Idx is negative or >= Size.
267 if (Idx->getValue().uge(Size))
268 continue;
269 unsigned IntIdx = Idx->getValue().getZExtValue();
270 // We can extractelement from undef vector.
271 if (isa<UndefValue>(Vec))
272 continue;
273 // For correct shuffling we have to have at most 2 different vector operands
274 // in all extractelement instructions.
275 if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
276 return None;
277 if (CommonShuffleMode == Permute)
278 continue;
279 // If the extract index is not the same as the operation number, it is a
280 // permutation.
281 if (IntIdx != I) {
282 CommonShuffleMode = Permute;
283 continue;
284 }
285 // Check the shuffle mode for the current operation.
286 if (!Vec1)
287 Vec1 = Vec;
288 else if (Vec != Vec1)
289 Vec2 = Vec;
290 // Example: shufflevector A, B, <0,5,2,7>
291 // I is odd and IntIdx for A == I - FirstAlternate shuffle.
292 // I is even and IntIdx for B == I - FirstAlternate shuffle.
293 // Example: shufflevector A, B, <4,1,6,3>
294 // I is even and IntIdx for A == I - SecondAlternate shuffle.
295 // I is odd and IntIdx for B == I - SecondAlternate shuffle.
296 const bool IIsEven = I & 1;
297 const bool CurrVecIsA = Vec == Vec1;
298 const bool IIsOdd = !IIsEven;
299 const bool CurrVecIsB = !CurrVecIsA;
300 ShuffleMode CurrentShuffleMode =
301 ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
302 : SecondAlternate;
303 // Common mode is not set or the same as the shuffle mode of the current
304 // operation - alternate.
305 if (CommonShuffleMode == Unknown)
306 CommonShuffleMode = CurrentShuffleMode;
307 // Common shuffle mode is not the same as the shuffle mode of the current
308 // operation - permutation.
309 if (CommonShuffleMode != CurrentShuffleMode)
310 CommonShuffleMode = Permute;
311 }
312 // If we're not crossing lanes in different vectors, consider it as blending.
313 if ((CommonShuffleMode == FirstAlternate ||
314 CommonShuffleMode == SecondAlternate) &&
315 Vec2)
316 return TargetTransformInfo::SK_Alternate;
317 // If Vec2 was never used, we have a permutation of a single vector, otherwise
318 // we have permutation of 2 vectors.
319 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
320 : TargetTransformInfo::SK_PermuteSingleSrc;
321}
322
323///\returns Opcode that can be clubbed with \p Op to create an alternate
324/// sequence which can later be merged as a ShuffleVector instruction.
325static unsigned getAltOpcode(unsigned Op) {
326 switch (Op) {
327 case Instruction::FAdd:
328 return Instruction::FSub;
329 case Instruction::FSub:
330 return Instruction::FAdd;
331 case Instruction::Add:
332 return Instruction::Sub;
333 case Instruction::Sub:
334 return Instruction::Add;
335 default:
336 return 0;
337 }
338}
339
340static bool isOdd(unsigned Value) {
341 return Value & 1;
342}
343
344static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
345 unsigned CheckedOpcode) {
346 return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
347}
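// For intuition, a schematic scalar pattern (hypothetical arrays a, b and r)
// that this alternate-opcode machinery targets: even lanes use the main
// opcode and odd lanes use its alternate, matching the isOdd(Cnt) check in
// getSameOpcode() below.
//
//   r[0] = a[0] + b[0];   // main opcode (Add/FAdd)
//   r[1] = a[1] - b[1];   // alternate opcode (Sub/FSub)
//   r[2] = a[2] + b[2];
//   r[3] = a[3] - b[3];
//
// Such a bundle can be emitted as one vector add, one vector sub, and a
// shufflevector selecting even lanes from the add and odd lanes from the sub.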
348
349/// Chooses the correct key for scheduling data. If \p Op has the same (or
350/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
351/// OpValue.
352static Value *isOneOf(Value *OpValue, Value *Op) {
353 auto *I = dyn_cast<Instruction>(Op);
354 if (!I)
355 return OpValue;
356 auto *OpInst = cast<Instruction>(OpValue);
357 unsigned OpInstOpcode = OpInst->getOpcode();
358 unsigned IOpcode = I->getOpcode();
359 if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
360 return Op;
361 return OpValue;
362}
363
364namespace {
365
366/// Contains data for the instructions going to be vectorized.
367struct RawInstructionsData {
368 /// Main Opcode of the instructions going to be vectorized.
369 unsigned Opcode = 0;
370
371 /// True if the list of instructions contains some instructions with alternate opcodes.
372 bool HasAltOpcodes = false;
373};
374
375} // end anonymous namespace
376
377/// Checks the list of the vectorized instructions \p VL and returns info about
378/// this list.
379static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
380 auto *I0 = dyn_cast<Instruction>(VL[0]);
381 if (!I0)
382 return {};
383 RawInstructionsData Res;
384 unsigned Opcode = I0->getOpcode();
385 // Walk through the list of the vectorized instructions
386 // in order to check its structure described by RawInstructionsData.
387 for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
388 auto *I = dyn_cast<Instruction>(VL[Cnt]);
389 if (!I)
390 return {};
391 if (Opcode != I->getOpcode())
392 Res.HasAltOpcodes = true;
393 }
394 Res.Opcode = Opcode;
395 return Res;
396}
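// Example (illustrative): for a bundle VL = {add, sub, add, sub} the result
// has Opcode = Instruction::Add (taken from VL[0]) and HasAltOpcodes = true;
// if any element of VL is not an Instruction, a default RawInstructionsData
// (Opcode == 0) is returned instead.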
397
398namespace {
399
400/// Main data required for vectorization of instructions.
401struct InstructionsState {
402 /// The very first instruction in the list with the main opcode.
403 Value *OpValue = nullptr;
404
405 /// The main opcode for the list of instructions.
406 unsigned Opcode = 0;
407
408 /// Some of the instructions in the list have alternate opcodes.
409 bool IsAltShuffle = false;
410
411 InstructionsState() = default;
412 InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
413 : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
414};
415
416} // end anonymous namespace
417
418/// \returns analysis of the instructions in \p VL described in
419/// InstructionsState: the Opcode with which we suppose the whole list
420/// could be vectorized, even if its structure is diverse.
421static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
422 auto Res = getMainOpcode(VL);
423 unsigned Opcode = Res.Opcode;
424 if (!Res.HasAltOpcodes)
425 return InstructionsState(VL[0], Opcode, false);
426 auto *OpInst = cast<Instruction>(VL[0]);
427 unsigned AltOpcode = getAltOpcode(Opcode);
428 // Examine each element in the list of instructions VL to determine
429 // whether some operations there could be considered alternatives
430 // (for example, subtraction as the alternate of addition).
431 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
432 auto *I = cast<Instruction>(VL[Cnt]);
433 unsigned InstOpcode = I->getOpcode();
434 if ((Res.HasAltOpcodes &&
435 InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
436 (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
437 return InstructionsState(OpInst, 0, false);
438 }
439 }
440 return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
441}
442
443/// \returns true if all of the values in \p VL have the same type or false
444/// otherwise.
445static bool allSameType(ArrayRef<Value *> VL) {
446 Type *Ty = VL[0]->getType();
447 for (int i = 1, e = VL.size(); i < e; i++)
448 if (VL[i]->getType() != Ty)
449 return false;
450
451 return true;
452}
453
454/// \returns True if Extract{Value,Element} instruction extracts element Idx.
455static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
456 assert(Opcode == Instruction::ExtractElement ||
457        Opcode == Instruction::ExtractValue);
458 if (Opcode == Instruction::ExtractElement) {
459 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
460 return CI && CI->getZExtValue() == Idx;
461 } else {
462 ExtractValueInst *EI = cast<ExtractValueInst>(E);
463 return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
464 }
465}
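// Example: given %e = extractelement <4 x i8> %x, i32 2, the call
// matchExtractIndex(%e, 2, Instruction::ExtractElement) returns true, while
// any other Idx value returns false.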
466
467/// \returns True if in-tree use also needs extract. This refers to
468/// possible scalar operand in vectorized instruction.
469static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
470 TargetLibraryInfo *TLI) {
471 unsigned Opcode = UserInst->getOpcode();
472 switch (Opcode) {
473 case Instruction::Load: {
474 LoadInst *LI = cast<LoadInst>(UserInst);
475 return (LI->getPointerOperand() == Scalar);
476 }
477 case Instruction::Store: {
478 StoreInst *SI = cast<StoreInst>(UserInst);
479 return (SI->getPointerOperand() == Scalar);
480 }
481 case Instruction::Call: {
482 CallInst *CI = cast<CallInst>(UserInst);
483 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
484 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
485 return (CI->getArgOperand(1) == Scalar);
486 }
487 LLVM_FALLTHROUGH;
488 }
489 default:
490 return false;
491 }
492}
493
494/// \returns the AA location that is being accessed by the instruction.
495static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
496 if (StoreInst *SI = dyn_cast<StoreInst>(I))
497 return MemoryLocation::get(SI);
498 if (LoadInst *LI = dyn_cast<LoadInst>(I))
499 return MemoryLocation::get(LI);
500 return MemoryLocation();
501}
502
503/// \returns True if the instruction is not a volatile or atomic load/store.
504static bool isSimple(Instruction *I) {
505 if (LoadInst *LI = dyn_cast<LoadInst>(I))
506 return LI->isSimple();
507 if (StoreInst *SI = dyn_cast<StoreInst>(I))
508 return SI->isSimple();
509 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
510 return !MI->isVolatile();
511 return true;
512}
513
514namespace llvm {
515
516namespace slpvectorizer {
517
518/// Bottom Up SLP Vectorizer.
519class BoUpSLP {
520public:
521 using ValueList = SmallVector<Value *, 8>;
522 using InstrList = SmallVector<Instruction *, 16>;
523 using ValueSet = SmallPtrSet<Value *, 16>;
524 using StoreList = SmallVector<StoreInst *, 8>;
525 using ExtraValueToDebugLocsMap =
526 MapVector<Value *, SmallVector<Instruction *, 2>>;
527
528 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
529 TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
530 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
531 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
532 : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
533 DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
534 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
535 // Use the vector register size specified by the target unless overridden
536 // by a command-line option.
537 // TODO: It would be better to limit the vectorization factor based on
538 // data type rather than just register size. For example, x86 AVX has
539 // 256-bit registers, but it does not support integer operations
540 // at that width (that requires AVX2).
541 if (MaxVectorRegSizeOption.getNumOccurrences())
542 MaxVecRegSize = MaxVectorRegSizeOption;
543 else
544 MaxVecRegSize = TTI->getRegisterBitWidth(true);
545
546 if (MinVectorRegSizeOption.getNumOccurrences())
547 MinVecRegSize = MinVectorRegSizeOption;
548 else
549 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
550 }
551
552 /// \brief Vectorize the tree that starts with the elements in \p VL.
553 /// Returns the vectorized root.
554 Value *vectorizeTree();
555
556 /// Vectorize the tree but with the list of externally used values \p
557 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
558 /// generated extractvalue instructions.
559 Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
560
561 /// \returns the cost incurred by unwanted spills and fills, caused by
562 /// holding live values over call sites.
563 int getSpillCost();
564
565 /// \returns the vectorization cost of the subtree that starts at \p VL.
566 /// A negative number means that this is profitable.
567 int getTreeCost();
568
569 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
570 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
571 void buildTree(ArrayRef<Value *> Roots,
572 ArrayRef<Value *> UserIgnoreLst = None);
573
574 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
575 /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
576 /// into account (and updating it, if required) the list of externally used
577 /// values stored in \p ExternallyUsedValues.
578 void buildTree(ArrayRef<Value *> Roots,
579 ExtraValueToDebugLocsMap &ExternallyUsedValues,
580 ArrayRef<Value *> UserIgnoreLst = None);
581
582 /// Clear the internal data structures that are created by 'buildTree'.
583 void deleteTree() {
584 VectorizableTree.clear();
585 ScalarToTreeEntry.clear();
586 MustGather.clear();
587 ExternalUses.clear();
588 NumOpsWantToKeepOrder.clear();
589 for (auto &Iter : BlocksSchedules) {
590 BlockScheduling *BS = Iter.second.get();
591 BS->clear();
592 }
593 MinBWs.clear();
594 }
595
596 unsigned getTreeSize() const { return VectorizableTree.size(); }
597
598 /// \brief Perform LICM and CSE on the newly generated gather sequences.
599 void optimizeGatherSequence();
600
601 /// \returns true if it is beneficial to reverse the vector order.
602 bool shouldReorder() const {
603 return std::accumulate(
604 NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
605 [](int Val1,
606 const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
607 return Val1 + (Val2.second < 0 ? 1 : -1);
608 }) > 0;
609 }
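  // Worked example (illustrative counts): if NumOpsWantToKeepOrder holds the
  // values {+2, -1, -3}, the accumulate above computes -1 + 1 + 1 = +1 > 0,
  // so reversing the vector order is reported as beneficial; each negative
  // count votes +1 for reordering and each non-negative count votes -1.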
610
611 /// \return The vector element size in bits to use when vectorizing the
612 /// expression tree ending at \p V. If V is a store, the size is the width of
613 /// the stored value. Otherwise, the size is the width of the largest loaded
614 /// value reaching V. This method is used by the vectorizer to calculate
615 /// vectorization factors.
616 unsigned getVectorElementSize(Value *V);
617
618 /// Compute the minimum type sizes required to represent the entries in a
619 /// vectorizable tree.
620 void computeMinimumValueSizes();
621
622 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
623 unsigned getMaxVecRegSize() const {
624 return MaxVecRegSize;
625 }
626
627 // \returns minimum vector register size as set by cl::opt.
628 unsigned getMinVecRegSize() const {
629 return MinVecRegSize;
630 }
631
632 /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
633 ///
634 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
635 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
636
637 /// \returns True if the VectorizableTree is both tiny and not fully
638 /// vectorizable. We do not vectorize such trees.
639 bool isTreeTinyAndNotFullyVectorizable();
640
641 OptimizationRemarkEmitter *getORE() { return ORE; }
642
643private:
644 struct TreeEntry;
645
646 /// Checks if all users of \p I are the part of the vectorization tree.
647 bool areAllUsersVectorized(Instruction *I) const;
648
649 /// \returns the cost of the vectorizable entry.
650 int getEntryCost(TreeEntry *E);
651
652 /// This is the recursive part of buildTree.
653 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
654
655 /// \returns True if the ExtractElement/ExtractValue instructions in VL can
656 /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
657 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
658
659 /// Vectorize a single entry in the tree.
660 Value *vectorizeTree(TreeEntry *E);
661
662 /// Vectorize a single entry in the tree, starting in \p VL.
663 Value *vectorizeTree(ArrayRef<Value *> VL);
664
665 /// \returns the scalarization cost for this type. Scalarization in this
666 /// context means the creation of vectors from a group of scalars.
667 int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
668
669 /// \returns the scalarization cost for this list of values. Assuming that
670 /// this subtree gets vectorized, we may need to extract the values from the
671 /// roots. This method calculates the cost of extracting the values.
672 int getGatherCost(ArrayRef<Value *> VL);
673
674 /// \brief Set the Builder insert point to one after the last instruction in
675 /// the bundle
676 void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
677
678 /// \returns a vector from a collection of scalars in \p VL.
679 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
680
681 /// \returns whether the VectorizableTree is fully vectorizable and will
682 /// be beneficial even if the tree height is tiny.
683 bool isFullyVectorizableTinyTree();
684
685 /// Reorder commutative operands in alt shuffle if they result in
686 /// vectorized code.
687 void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
688 SmallVectorImpl<Value *> &Left,
689 SmallVectorImpl<Value *> &Right);
690
691 /// Reorder commutative operands to get better probability of
692 /// generating vectorized code.
693 void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
694 SmallVectorImpl<Value *> &Left,
695 SmallVectorImpl<Value *> &Right);
696 struct TreeEntry {
697 TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}
698
699 /// \returns true if the scalars in VL are equal to this entry.
700 bool isSame(ArrayRef<Value *> VL) const {
701 if (VL.size() == Scalars.size())
702 return std::equal(VL.begin(), VL.end(), Scalars.begin());
703 return VL.size() == ReuseShuffleIndices.size() &&
704 std::equal(
705 VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
706 [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
707 }
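    // Example (illustrative values): with Scalars = {A, B} and
    // ReuseShuffleIndices = {0, 0, 1, 1}, a query VL = {A, A, B, B} matches
    // this entry, since each VL element equals Scalars[Idx] for the
    // corresponding reuse index.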
708
709 /// A vector of scalars.
710 ValueList Scalars;
711
712 /// The Scalars are vectorized into this value. It is initialized to Null.
713 Value *VectorizedValue = nullptr;
714
715 /// Do we need to gather this sequence ?
716 bool NeedToGather = false;
717
718 /// Does this sequence require some shuffling?
719 SmallVector<unsigned, 4> ReuseShuffleIndices;
720
721 /// Points back to the VectorizableTree.
722 ///
723 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
724 /// to be a pointer and needs to be able to initialize the child iterator.
725 /// Thus we need a reference back to the container to translate the indices
726 /// to entries.
727 std::vector<TreeEntry> &Container;
728
729 /// The TreeEntry index containing the user of this entry. We can actually
730 /// have multiple users so the data structure is not truly a tree.
731 SmallVector<int, 1> UserTreeIndices;
732 };
733
734 /// Create a new VectorizableTree entry.
735 void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
736 ArrayRef<unsigned> ReuseShuffleIndices = None) {
737 VectorizableTree.emplace_back(VectorizableTree);
738 int idx = VectorizableTree.size() - 1;
739 TreeEntry *Last = &VectorizableTree[idx];
740 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
741 Last->NeedToGather = !Vectorized;
742 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
743 ReuseShuffleIndices.end());
744 if (Vectorized) {
745 for (int i = 0, e = VL.size(); i != e; ++i) {
746 assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
747 ScalarToTreeEntry[VL[i]] = idx;
748 }
749 } else {
750 MustGather.insert(VL.begin(), VL.end());
751 }
752
753 if (UserTreeIdx >= 0)
754 Last->UserTreeIndices.push_back(UserTreeIdx);
755 UserTreeIdx = idx;
756 }
757
758 /// -- Vectorization State --
759 /// Holds all of the tree entries.
760 std::vector<TreeEntry> VectorizableTree;
761
762 TreeEntry *getTreeEntry(Value *V) {
763 auto I = ScalarToTreeEntry.find(V);
764 if (I != ScalarToTreeEntry.end())
765 return &VectorizableTree[I->second];
766 return nullptr;
767 }
768
769 /// Maps a specific scalar to its tree entry.
770 SmallDenseMap<Value*, int> ScalarToTreeEntry;
771
772 /// A list of scalars that we found that we need to keep as scalars.
773 ValueSet MustGather;
774
775 /// This POD struct describes one external user in the vectorized tree.
776 struct ExternalUser {
777 ExternalUser(Value *S, llvm::User *U, int L)
778 : Scalar(S), User(U), Lane(L) {}
779
780 // Which scalar in our function.
781 Value *Scalar;
782
783 // Which user that uses the scalar.
784 llvm::User *User;
785
786 // Which lane does the scalar belong to.
787 int Lane;
788 };
789 using UserList = SmallVector<ExternalUser, 16>;
790
791 /// Checks if two instructions may access the same memory.
792 ///
793 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
794 /// is invariant in the calling loop.
795 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
796 Instruction *Inst2) {
797 // First check if the result is already in the cache.
798 AliasCacheKey key = std::make_pair(Inst1, Inst2);
799 Optional<bool> &result = AliasCache[key];
800 if (result.hasValue()) {
801 return result.getValue();
802 }
803 MemoryLocation Loc2 = getLocation(Inst2, AA);
804 bool aliased = true;
805 if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
806 // Do the alias check.
807 aliased = AA->alias(Loc1, Loc2);
808 }
809 // Store the result in the cache.
810 result = aliased;
811 return aliased;
812 }
813
814 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
815
816 /// Cache for alias results.
817 /// TODO: consider moving this to the AliasAnalysis itself.
818 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
819
820 /// Removes an instruction from its block and eventually deletes it.
821 /// It's like Instruction::eraseFromParent() except that the actual deletion
822 /// is delayed until BoUpSLP is destructed.
823 /// This is required to ensure that there are no incorrect collisions in the
824 /// AliasCache, which can happen if a new instruction is allocated at the
825 /// same address as a previously deleted instruction.
826 void eraseInstruction(Instruction *I) {
827 I->removeFromParent();
828 I->dropAllReferences();
829 DeletedInstructions.emplace_back(I);
830 }
831
832 /// Temporary store for deleted instructions. Instructions will be deleted
833 /// eventually when the BoUpSLP is destructed.
834 SmallVector<unique_value, 8> DeletedInstructions;
835
836 /// A list of values that need to extracted out of the tree.
837 /// This list holds pairs of (Internal Scalar : External User). External User
838 /// can be nullptr, it means that this Internal Scalar will be used later,
839 /// after vectorization.
840 UserList ExternalUses;
841
842 /// Values used only by @llvm.assume calls.
843 SmallPtrSet<const Value *, 32> EphValues;
844
845 /// Holds all of the instructions that we gathered.
846 SetVector<Instruction *> GatherSeq;
847
848 /// A list of blocks that we are going to CSE.
849 SetVector<BasicBlock *> CSEBlocks;
850
851 /// Contains all scheduling relevant data for an instruction.
852 /// A ScheduleData either represents a single instruction or a member of an
853 /// instruction bundle (= a group of instructions which is combined into a
854 /// vector instruction).
855 struct ScheduleData {
856 // The initial value for the dependency counters. It means that the
857 // dependencies are not calculated yet.
858 enum { InvalidDeps = -1 };
859
860 ScheduleData() = default;
861
862 void init(int BlockSchedulingRegionID, Value *OpVal) {
863 FirstInBundle = this;
864 NextInBundle = nullptr;
865 NextLoadStore = nullptr;
866 IsScheduled = false;
867 SchedulingRegionID = BlockSchedulingRegionID;
868 UnscheduledDepsInBundle = UnscheduledDeps;
869 clearDependencies();
870 OpValue = OpVal;
871 }
872
873 /// Returns true if the dependency information has been calculated.
874 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
875
876 /// Returns true for single instructions and for bundle representatives
877 /// (= the head of a bundle).
878 bool isSchedulingEntity() const { return FirstInBundle == this; }
879
880 /// Returns true if it represents an instruction bundle and not only a
881 /// single instruction.
882 bool isPartOfBundle() const {
883 return NextInBundle != nullptr || FirstInBundle != this;
884 }
885
886 /// Returns true if it is ready for scheduling, i.e. it has no more
887 /// unscheduled depending instructions/bundles.
888 bool isReady() const {
889 assert(isSchedulingEntity() &&
890        "can't consider non-scheduling entity for ready list");
891 return UnscheduledDepsInBundle == 0 && !IsScheduled;
892 }
893
894 /// Modifies the number of unscheduled dependencies, also updating it for
895 /// the whole bundle.
896 int incrementUnscheduledDeps(int Incr) {
897 UnscheduledDeps += Incr;
898 return FirstInBundle->UnscheduledDepsInBundle += Incr;
899 }
900
901 /// Sets the number of unscheduled dependencies to the number of
902 /// dependencies.
903 void resetUnscheduledDeps() {
904 incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
905 }
906
907 /// Clears all dependency information.
908 void clearDependencies() {
909 Dependencies = InvalidDeps;
910 resetUnscheduledDeps();
911 MemoryDependencies.clear();
912 }
913
914 void dump(raw_ostream &os) const {
915 if (!isSchedulingEntity()) {
916 os << "/ " << *Inst;
917 } else if (NextInBundle) {
918 os << '[' << *Inst;
919 ScheduleData *SD = NextInBundle;
920 while (SD) {
921 os << ';' << *SD->Inst;
922 SD = SD->NextInBundle;
923 }
924 os << ']';
925 } else {
926 os << *Inst;
927 }
928 }
929
930 Instruction *Inst = nullptr;
931
932 /// Points to the head in an instruction bundle (and always to this for
933 /// single instructions).
934 ScheduleData *FirstInBundle = nullptr;
935
936 /// Single linked list of all instructions in a bundle. Null if it is a
937 /// single instruction.
938 ScheduleData *NextInBundle = nullptr;
939
940 /// Single linked list of all memory instructions (e.g. load, store, call)
941 /// in the block - until the end of the scheduling region.
942 ScheduleData *NextLoadStore = nullptr;
943
944 /// The dependent memory instructions.
945 /// This list is derived on demand in calculateDependencies().
946 SmallVector<ScheduleData *, 4> MemoryDependencies;
947
948 /// This ScheduleData is in the current scheduling region if this matches
949 /// the current SchedulingRegionID of BlockScheduling.
950 int SchedulingRegionID = 0;
951
952 /// Used for getting a "good" final ordering of instructions.
953 int SchedulingPriority = 0;
954
955 /// The number of dependencies. Consists of the number of users of the
956 /// instruction plus the number of dependent memory instructions (if any).
957 /// This value is calculated on demand.
958 /// If InvalidDeps, the number of dependencies is not calculated yet.
959 int Dependencies = InvalidDeps;
960
961 /// The number of dependencies minus the number of dependencies of scheduled
962 /// instructions. As soon as this is zero, the instruction/bundle gets ready
963 /// for scheduling.
964 /// Note that this is negative as long as Dependencies is not calculated.
965 int UnscheduledDeps = InvalidDeps;
966
967 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
968 /// single instructions.
969 int UnscheduledDepsInBundle = InvalidDeps;
970
971 /// True if this instruction is scheduled (or considered as scheduled in the
972 /// dry-run).
973 bool IsScheduled = false;
974
975 /// Opcode of the current instruction in the schedule data.
976 Value *OpValue = nullptr;
977 };
978
979#ifndef NDEBUG
980 friend inline raw_ostream &operator<<(raw_ostream &os,
981 const BoUpSLP::ScheduleData &SD) {
982 SD.dump(os);
983 return os;
984 }
985#endif
986
987 friend struct GraphTraits<BoUpSLP *>;
988 friend struct DOTGraphTraits<BoUpSLP *>;
989
990 /// Contains all scheduling data for a basic block.
991 struct BlockScheduling {
992 BlockScheduling(BasicBlock *BB)
993 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
994
995 void clear() {
996 ReadyInsts.clear();
997 ScheduleStart = nullptr;
998 ScheduleEnd = nullptr;
999 FirstLoadStoreInRegion = nullptr;
1000 LastLoadStoreInRegion = nullptr;
1001
1002 // Reduce the maximum schedule region size by the size of the
1003 // previous scheduling run.
1004 ScheduleRegionSizeLimit -= ScheduleRegionSize;
1005 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
1006 ScheduleRegionSizeLimit = MinScheduleRegionSize;
1007 ScheduleRegionSize = 0;
1008
1009 // Make a new scheduling region, i.e. all existing ScheduleData is not
1010 // in the new region yet.
1011 ++SchedulingRegionID;
1012 }
1013
1014 ScheduleData *getScheduleData(Value *V) {
1015 ScheduleData *SD = ScheduleDataMap[V];
1016 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1017 return SD;
1018 return nullptr;
1019 }
1020
1021 ScheduleData *getScheduleData(Value *V, Value *Key) {
1022 if (V == Key)
1023 return getScheduleData(V);
1024 auto I = ExtraScheduleDataMap.find(V);
1025 if (I != ExtraScheduleDataMap.end()) {
1026 ScheduleData *SD = I->second[Key];
1027 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1028 return SD;
1029 }
1030 return nullptr;
1031 }
1032
1033 bool isInSchedulingRegion(ScheduleData *SD) {
1034 return SD->SchedulingRegionID == SchedulingRegionID;
1035 }
1036
1037 /// Marks an instruction as scheduled and puts all dependent ready
1038 /// instructions into the ready-list.
1039 template <typename ReadyListType>
1040 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
1041 SD->IsScheduled = true;
1042 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
1043
1044 ScheduleData *BundleMember = SD;
1045 while (BundleMember) {
1046 if (BundleMember->Inst != BundleMember->OpValue) {
1047 BundleMember = BundleMember->NextInBundle;
1048 continue;
1049 }
1050 // Handle the def-use chain dependencies.
1051 for (Use &U : BundleMember->Inst->operands()) {
1052 auto *I = dyn_cast<Instruction>(U.get());
1053 if (!I)
1054 continue;
1055 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
1056 if (OpDef && OpDef->hasValidDependencies() &&
1057 OpDef->incrementUnscheduledDeps(-1) == 0) {
1058 // There are no more unscheduled dependencies after
1059 // decrementing, so we can put the dependent instruction
1060 // into the ready list.
1061 ScheduleData *DepBundle = OpDef->FirstInBundle;
1062 assert(!DepBundle->IsScheduled &&
1063        "already scheduled bundle gets ready");
1064 ReadyList.insert(DepBundle);
1065 DEBUG(dbgs()
1066       << "SLP: gets ready (def): " << *DepBundle << "\n");
1067 }
1068 });
1069 }
1070 // Handle the memory dependencies.
1071 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
1072 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
1073 // There are no more unscheduled dependencies after decrementing,
1074 // so we can put the dependent instruction into the ready list.
1075 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
1076 assert(!DepBundle->IsScheduled &&
1077        "already scheduled bundle gets ready");
1078 ReadyList.insert(DepBundle);
1079 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
1080              << "\n");
1081 }
1082 }
1083 BundleMember = BundleMember->NextInBundle;
1084 }
1085 }
1086
1087 void doForAllOpcodes(Value *V,
1088 function_ref<void(ScheduleData *SD)> Action) {
1089 if (ScheduleData *SD = getScheduleData(V))
1090 Action(SD);
1091 auto I = ExtraScheduleDataMap.find(V);
1092 if (I != ExtraScheduleDataMap.end())
1093 for (auto &P : I->second)
1094 if (P.second->SchedulingRegionID == SchedulingRegionID)
1095 Action(P.second);
1096 }
1097
1098 /// Put all instructions into the ReadyList which are ready for scheduling.
1099 template <typename ReadyListType>
1100 void initialFillReadyList(ReadyListType &ReadyList) {
1101 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
1102 doForAllOpcodes(I, [&](ScheduleData *SD) {
1103 if (SD->isSchedulingEntity() && SD->isReady()) {
1104 ReadyList.insert(SD);
1105 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
1106 }
1107 });
1108 }
1109 }
1110
1111 /// Checks if a bundle of instructions can be scheduled, i.e. has no
1112 /// cyclic dependencies. This is only a dry-run, no instructions are
1113 /// actually moved at this stage.
1114 bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
1115
1116 /// Un-bundles a group of instructions.
1117 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
1118
1119 /// Allocates schedule data chunk.
1120 ScheduleData *allocateScheduleDataChunks();
1121
1122 /// Extends the scheduling region so that V is inside the region.
1123 /// \returns true if the region size is within the limit.
1124 bool extendSchedulingRegion(Value *V, Value *OpValue);
1125
1126 /// Initialize the ScheduleData structures for new instructions in the
1127 /// scheduling region.
1128 void initScheduleData(Instruction *FromI, Instruction *ToI,
1129 ScheduleData *PrevLoadStore,
1130 ScheduleData *NextLoadStore);
1131
1132 /// Updates the dependency information of a bundle and of all instructions/
1133 /// bundles which depend on the original bundle.
1134 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
1135 BoUpSLP *SLP);
1136
1137 /// Sets all instructions in the scheduling region to un-scheduled.
1138 void resetSchedule();
1139
1140 BasicBlock *BB;
1141
1142 /// Simple memory allocation for ScheduleData.
1143 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
1144
1145 /// The size of a ScheduleData array in ScheduleDataChunks.
1146 int ChunkSize;
1147
1148 /// The allocator position in the current chunk, which is the last entry
1149 /// of ScheduleDataChunks.
1150 int ChunkPos;
1151
1152 /// Attaches ScheduleData to Instruction.
1153 /// Note that the mapping survives during all vectorization iterations, i.e.
1154 /// ScheduleData structures are recycled.
1155 DenseMap<Value *, ScheduleData *> ScheduleDataMap;
1156
1157 /// Attaches ScheduleData to Instruction with the leading key.
1158 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
1159 ExtraScheduleDataMap;
1160
1161 struct ReadyList : SmallVector<ScheduleData *, 8> {
1162 void insert(ScheduleData *SD) { push_back(SD); }
1163 };
1164
1165 /// The ready-list for scheduling (only used for the dry-run).
1166 ReadyList ReadyInsts;
1167
1168 /// The first instruction of the scheduling region.
1169 Instruction *ScheduleStart = nullptr;
1170
1171 /// The first instruction _after_ the scheduling region.
1172 Instruction *ScheduleEnd = nullptr;
1173
1174 /// The first memory accessing instruction in the scheduling region
1175 /// (can be null).
1176 ScheduleData *FirstLoadStoreInRegion = nullptr;
1177
1178 /// The last memory accessing instruction in the scheduling region
1179 /// (can be null).
1180 ScheduleData *LastLoadStoreInRegion = nullptr;
1181
1182 /// The current size of the scheduling region.
1183 int ScheduleRegionSize = 0;
1184
1185 /// The maximum size allowed for the scheduling region.
1186 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
1187
1188 /// The ID of the scheduling region. For a new vectorization iteration this
1189 /// is incremented which "removes" all ScheduleData from the region.
1190 // Make sure that the initial SchedulingRegionID is greater than the
1191 // initial SchedulingRegionID in ScheduleData (which is 0).
1192 int SchedulingRegionID = 1;
1193 };
1194
1195 /// Attaches the BlockScheduling structures to basic blocks.
1196 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
1197
1198 /// Performs the "real" scheduling. Done before vectorization is actually
1199 /// performed in a basic block.
1200 void scheduleBlock(BlockScheduling *BS);
1201
1202 /// List of users to ignore during scheduling and that don't need extracting.
1203 ArrayRef<Value *> UserIgnoreList;
1204
1205 /// Number of operation bundles that contain consecutive operations - number
1206 /// of operation bundles that contain consecutive operations in reversed
1207 /// order.
1208 DenseMap<unsigned, int> NumOpsWantToKeepOrder;
1209
1210 // Analysis and block reference.
1211 Function *F;
1212 ScalarEvolution *SE;
1213 TargetTransformInfo *TTI;
1214 TargetLibraryInfo *TLI;
1215 AliasAnalysis *AA;
1216 LoopInfo *LI;
1217 DominatorTree *DT;
1218 AssumptionCache *AC;
1219 DemandedBits *DB;
1220 const DataLayout *DL;
1221 OptimizationRemarkEmitter *ORE;
1222
1223 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
1224 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
1225
1226 /// Instruction builder to construct the vectorized tree.
1227 IRBuilder<> Builder;
1228
1229 /// A map of scalar integer values to the smallest bit width with which they
1230 /// can legally be represented. The values map to (width, signed) pairs,
1231 /// where "width" indicates the minimum bit width and "signed" is True if the
1232 /// value must be signed-extended, rather than zero-extended, back to its
1233 /// original width.
1234 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
1235};
1236
1237} // end namespace slpvectorizer
1238
1239template <> struct GraphTraits<BoUpSLP *> {
1240 using TreeEntry = BoUpSLP::TreeEntry;
1241
1242 /// NodeRef has to be a pointer per the GraphWriter.
1243 using NodeRef = TreeEntry *;
1244
1245 /// \brief Add the VectorizableTree to the index iterator to be able to return
1246 /// TreeEntry pointers.
1247 struct ChildIteratorType
1248 : public iterator_adaptor_base<ChildIteratorType,
1249 SmallVector<int, 1>::iterator> {
1250 std::vector<TreeEntry> &VectorizableTree;
1251
1252 ChildIteratorType(SmallVector<int, 1>::iterator W,
1253 std::vector<TreeEntry> &VT)
1254 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
1255
1256 NodeRef operator*() { return &VectorizableTree[*I]; }
1257 };
1258
1259 static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
1260
1261 static ChildIteratorType child_begin(NodeRef N) {
1262 return {N->UserTreeIndices.begin(), N->Container};
1263 }
1264
1265 static ChildIteratorType child_end(NodeRef N) {
1266 return {N->UserTreeIndices.end(), N->Container};
1267 }
1268
1269 /// For the node iterator we just need to turn the TreeEntry iterator into a
1270 /// TreeEntry* iterator so that it dereferences to NodeRef.
1271 using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;
1272
1273 static nodes_iterator nodes_begin(BoUpSLP *R) {
1274 return nodes_iterator(R->VectorizableTree.begin());
1275 }
1276
1277 static nodes_iterator nodes_end(BoUpSLP *R) {
1278 return nodes_iterator(R->VectorizableTree.end());
1279 }
1280
1281 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
1282};
1283
1284template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
1285 using TreeEntry = BoUpSLP::TreeEntry;
1286
1287 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
1288
1289 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
1290 std::string Str;
1291 raw_string_ostream OS(Str);
1292 if (isSplat(Entry->Scalars)) {
1293 OS << "<splat> " << *Entry->Scalars[0];
1294 return Str;
1295 }
1296 for (auto V : Entry->Scalars) {
1297 OS << *V;
1298 if (std::any_of(
1299 R->ExternalUses.begin(), R->ExternalUses.end(),
1300 [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
1301 OS << " <extract>";
1302 OS << "\n";
1303 }
1304 return Str;
1305 }
1306
1307 static std::string getNodeAttributes(const TreeEntry *Entry,
1308 const BoUpSLP *) {
1309 if (Entry->NeedToGather)
1310 return "color=red";
1311 return "";
1312 }
1313};
1314
1315} // end namespace llvm
1316
1317void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1318 ArrayRef<Value *> UserIgnoreLst) {
1319 ExtraValueToDebugLocsMap ExternallyUsedValues;
1320 buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
1321}
1322
1323void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1324 ExtraValueToDebugLocsMap &ExternallyUsedValues,
1325 ArrayRef<Value *> UserIgnoreLst) {
1326 deleteTree();
1327 UserIgnoreList = UserIgnoreLst;
1328 if (!allSameType(Roots))
1329 return;
1330 buildTree_rec(Roots, 0, -1);
1331
1332 // Collect the values that we need to extract from the tree.
1333 for (TreeEntry &EIdx : VectorizableTree) {
1334 TreeEntry *Entry = &EIdx;
1335
1336 // No need to handle users of gathered values.
1337 if (Entry->NeedToGather)
1338 continue;
1339
1340 // For each lane:
1341 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
1342 Value *Scalar = Entry->Scalars[Lane];
1343 int FoundLane = Lane;
1344 if (!Entry->ReuseShuffleIndices.empty()) {
1345 FoundLane =
1346 std::distance(Entry->ReuseShuffleIndices.begin(),
1347 llvm::find(Entry->ReuseShuffleIndices, FoundLane));
1348 }
1349
1350 // Check if the scalar is externally used as an extra arg.
1351 auto ExtI = ExternallyUsedValues.find(Scalar);
1352 if (ExtI != ExternallyUsedValues.end()) {
1353        DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
1354                     Lane << " from " << *Scalar << ".\n");
1355 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
1356 }
1357 for (User *U : Scalar->users()) {
1358        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
1359
1360 Instruction *UserInst = dyn_cast<Instruction>(U);
1361 if (!UserInst)
1362 continue;
1363
1364 // Skip in-tree scalars that become vectors
1365 if (TreeEntry *UseEntry = getTreeEntry(U)) {
1366 Value *UseScalar = UseEntry->Scalars[0];
1367 // Some in-tree scalars will remain as scalar in vectorized
1368 // instructions. If that is the case, the one in Lane 0 will
1369 // be used.
1370 if (UseScalar != U ||
1371 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
1372            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
1373                         << ".\n");
1374            assert(!UseEntry->NeedToGather && "Bad state");
1375 continue;
1376 }
1377 }
1378
1379 // Ignore users in the user ignore list.
1380 if (is_contained(UserIgnoreList, UserInst))
1381 continue;
1382
1383        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
1384                     Lane << " from " << *Scalar << ".\n");
1385 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
1386 }
1387 }
1388 }
1389}
1390
1391void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
1392 int UserTreeIdx) {
1393  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
1394
1395 InstructionsState S = getSameOpcode(VL);
1396 if (Depth == RecursionMaxDepth) {
1397    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
1398 newTreeEntry(VL, false, UserTreeIdx);
1399 return;
1400 }
1401
1402 // Don't handle vectors.
1403 if (S.OpValue->getType()->isVectorTy()) {
1404    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1405 newTreeEntry(VL, false, UserTreeIdx);
1406 return;
1407 }
1408
1409 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
1410 if (SI->getValueOperand()->getType()->isVectorTy()) {
1411      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1412 newTreeEntry(VL, false, UserTreeIdx);
1413 return;
1414 }
1415
1416 // If all of the operands are identical or constant we have a simple solution.
1417 if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
1418    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1419 newTreeEntry(VL, false, UserTreeIdx);
1420 return;
1421 }
1422
1423 // We now know that this is a vector of instructions of the same type from
1424 // the same block.
1425
1426 // Don't vectorize ephemeral values.
1427 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1428 if (EphValues.count(VL[i])) {
1429      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1430                   ") is ephemeral.\n");
1431 newTreeEntry(VL, false, UserTreeIdx);
1432 return;
1433 }
1434 }
1435
1436 // Check if this is a duplicate of another entry.
1437 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
1438    DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
1439 if (!E->isSame(VL)) {
1440      DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1441 newTreeEntry(VL, false, UserTreeIdx);
1442 return;
1443 }
1444 // Record the reuse of the tree node. FIXME, currently this is only used to
1445 // properly draw the graph rather than for the actual vectorization.
1446 E->UserTreeIndices.push_back(UserTreeIdx);
1447    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
1448 return;
1449 }
1450
1451 // Check that none of the instructions in the bundle are already in the tree.
1452 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1453 auto *I = dyn_cast<Instruction>(VL[i]);
1454 if (!I)
1455 continue;
1456 if (getTreeEntry(I)) {
1457      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1458                   ") is already in tree.\n");
1459 newTreeEntry(VL, false, UserTreeIdx);
1460 return;
1461 }
1462 }
1463
1464 // If any of the scalars is marked as a value that needs to stay scalar, then
1465 // we need to gather the scalars.
1466 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1467 if (MustGather.count(VL[i])) {
1468      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1469 newTreeEntry(VL, false, UserTreeIdx);
1470 return;
1471 }
1472 }
1473
1474 // Check that all of the users of the scalars that we want to vectorize are
1475 // schedulable.
1476 auto *VL0 = cast<Instruction>(S.OpValue);
1477 BasicBlock *BB = VL0->getParent();
1478
1479 if (!DT->isReachableFromEntry(BB)) {
1480 // Don't go into unreachable blocks. They may contain instructions with
1481 // dependency cycles which confuse the final scheduling.
1482    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1483 newTreeEntry(VL, false, UserTreeIdx);
1484 return;
1485 }
1486
1487 // Check that every instruction appears once in this bundle.
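      // For each lane, ReuseShuffleIndicies records the position of that lane's
      // value among the unique values, so duplicated lanes can later be rebuilt
      // with a single shuffle of the deduplicated bundle.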
1488 SmallVector<unsigned, 4> ReuseShuffleIndicies;
1489 SmallVector<Value *, 4> UniqueValues;
1490 DenseMap<Value *, unsigned> UniquePositions;
1491 for (Value *V : VL) {
1492 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
1493 ReuseShuffleIndicies.emplace_back(Res.first->second);
1494 if (Res.second)
1495 UniqueValues.emplace_back(V);
1496 }
1497 if (UniqueValues.size() == VL.size()) {
1498 ReuseShuffleIndicies.clear();
1499 } else {
1500    DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
1501 if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
1502      DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1503 newTreeEntry(VL, false, UserTreeIdx);
1504 return;
1505 }
1506 VL = UniqueValues;
1507 }
1508
1509 auto &BSRef = BlocksSchedules[BB];
1510 if (!BSRef)
1511 BSRef = llvm::make_unique<BlockScheduling>(BB);
1512
1513 BlockScheduling &BS = *BSRef.get();
1514
1515 if (!BS.tryScheduleBundle(VL, this, VL0)) {
1516    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1517    assert((!BS.getScheduleData(VL0) ||
1518            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
1519           "tryScheduleBundle should cancelScheduling on failure");
1520 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1521 return;
1522 }
1523  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1524
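  // A bundle with alternating opcodes (e.g. add/sub) is dispatched below under
  // the ShuffleVector case rather than under its individual opcodes.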
1525 unsigned ShuffleOrOp = S.IsAltShuffle ?
1526 (unsigned) Instruction::ShuffleVector : S.Opcode;
1527 switch (ShuffleOrOp) {
1528 case Instruction::PHI: {
1529 PHINode *PH = dyn_cast<PHINode>(VL0);
1530
1531 // Check for terminator values (e.g. invoke).
1532 for (unsigned j = 0; j < VL.size(); ++j)
1533 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1534 TerminatorInst *Term = dyn_cast<TerminatorInst>(
1535 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1536 if (Term) {
1537            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1538 BS.cancelScheduling(VL, VL0);
1539 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1540 return;
1541 }
1542 }
1543
1544 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1545      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1546
1547 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1548 ValueList Operands;
1549 // Prepare the operand vector.
1550 for (Value *j : VL)
1551 Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
1552 PH->getIncomingBlock(i)));
1553
1554 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1555 }
1556 return;
1557 }
1558 case Instruction::ExtractValue:
1559 case Instruction::ExtractElement: {
1560 bool Reuse = canReuseExtract(VL, VL0);
1561 if (Reuse) {
1562        DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
1563 ++NumOpsWantToKeepOrder[S.Opcode];
1564 } else {
1565 SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
1566 if (canReuseExtract(ReverseVL, VL0))
1567 --NumOpsWantToKeepOrder[S.Opcode];
1568 BS.cancelScheduling(VL, VL0);
1569 }
1570 newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies);
1571 return;
1572 }
1573 case Instruction::Load: {
1574 // Check that a vectorized load would load the same memory as a scalar
1575 // load. For example, we don't want to vectorize loads that are smaller
1576 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
1577 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
1578 // from such a struct, we read/write packed bits disagreeing with the
1579 // unvectorized version.
1580 Type *ScalarTy = VL0->getType();
1581
1582 if (DL->getTypeSizeInBits(ScalarTy) !=
1583 DL->getTypeAllocSizeInBits(ScalarTy)) {
1584 BS.cancelScheduling(VL, VL0);
1585 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1586        DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
1587 return;
1588 }
1589
1590 // Make sure all loads in the bundle are simple - we can't vectorize
1591 // atomic or volatile loads.
1592 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1593 LoadInst *L = cast<LoadInst>(VL[i]);
1594 if (!L->isSimple()) {
1595 BS.cancelScheduling(VL, VL0);
1596 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1597          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1598 return;
1599 }
1600 }
1601
1602 // Check if the loads are consecutive, reversed, or neither.
1603 // TODO: What we really want is to sort the loads, but for now, check
1604 // the two likely directions.
1605 bool Consecutive = true;
1606 bool ReverseConsecutive = true;
1607 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1608 if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1609 Consecutive = false;
1610 break;
1611 } else {
1612 ReverseConsecutive = false;
1613 }
1614 }
1615
1616 if (Consecutive) {
1617 ++NumOpsWantToKeepOrder[S.Opcode];
1618 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1619        DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1620 return;
1621 }
1622
1623 // If none of the load pairs were consecutive when checked in order,
1624 // check the reverse order.
1625 if (ReverseConsecutive)
1626 for (unsigned i = VL.size() - 1; i > 0; --i)
1627 if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
1628 ReverseConsecutive = false;
1629 break;
1630 }
1631
1632 if (ReverseConsecutive) {
1633 --NumOpsWantToKeepOrder[S.Opcode];
1634 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1635        DEBUG(dbgs() << "SLP: added a vector of reversed loads.\n");
1636 return;
1637 }
1638
1639      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1640 BS.cancelScheduling(VL, VL0);
1641 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1642 return;
1643 }
1644 case Instruction::ZExt:
1645 case Instruction::SExt:
1646 case Instruction::FPToUI:
1647 case Instruction::FPToSI:
1648 case Instruction::FPExt:
1649 case Instruction::PtrToInt:
1650 case Instruction::IntToPtr:
1651 case Instruction::SIToFP:
1652 case Instruction::UIToFP:
1653 case Instruction::Trunc:
1654 case Instruction::FPTrunc:
1655 case Instruction::BitCast: {
1656 Type *SrcTy = VL0->getOperand(0)->getType();
1657 for (unsigned i = 0; i < VL.size(); ++i) {
1658 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1659 if (Ty != SrcTy || !isValidElementType(Ty)) {
1660 BS.cancelScheduling(VL, VL0);
1661 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1662          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1663 return;
1664 }
1665 }
1666 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1667      DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1668
1669 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1670 ValueList Operands;
1671 // Prepare the operand vector.
1672 for (Value *j : VL)
1673 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1674
1675 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1676 }
1677 return;
1678 }
1679 case Instruction::ICmp:
1680 case Instruction::FCmp: {
1681 // Check that all of the compares have the same predicate.
1682 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1683 Type *ComparedTy = VL0->getOperand(0)->getType();
1684 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1685 CmpInst *Cmp = cast<CmpInst>(VL[i]);
1686 if (Cmp->getPredicate() != P0 ||
1687 Cmp->getOperand(0)->getType() != ComparedTy) {
1688 BS.cancelScheduling(VL, VL0);
1689 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1690          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1691 return;
1692 }
1693 }
1694
1695 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1696      DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1697
1698 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1699 ValueList Operands;
1700 // Prepare the operand vector.
1701 for (Value *j : VL)
1702 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1703
1704 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1705 }
1706 return;
1707 }
1708 case Instruction::Select:
1709 case Instruction::Add:
1710 case Instruction::FAdd:
1711 case Instruction::Sub:
1712 case Instruction::FSub:
1713 case Instruction::Mul:
1714 case Instruction::FMul:
1715 case Instruction::UDiv:
1716 case Instruction::SDiv:
1717 case Instruction::FDiv:
1718 case Instruction::URem:
1719 case Instruction::SRem:
1720 case Instruction::FRem:
1721 case Instruction::Shl:
1722 case Instruction::LShr:
1723 case Instruction::AShr:
1724 case Instruction::And:
1725 case Instruction::Or:
1726 case Instruction::Xor:
1727 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1728      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1729
1730 // Sort operands of the instructions so that each side is more likely to
1731 // have the same opcode.
1732 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1733 ValueList Left, Right;
1734 reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
1735 buildTree_rec(Left, Depth + 1, UserTreeIdx);
1736 buildTree_rec(Right, Depth + 1, UserTreeIdx);
1737 return;
1738 }
1739
1740 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1741 ValueList Operands;
1742 // Prepare the operand vector.
1743 for (Value *j : VL)
1744 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1745
1746 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1747 }
1748 return;
1749
1750 case Instruction::GetElementPtr: {
1751 // We don't combine GEPs with complicated (nested) indexing.
1752 for (unsigned j = 0; j < VL.size(); ++j) {
1753 if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1754          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1755 BS.cancelScheduling(VL, VL0);
1756 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1757 return;
1758 }
1759 }
1760
1761 // We can't combine several GEPs into one vector if they operate on
1762 // different types.
1763 Type *Ty0 = VL0->getOperand(0)->getType();
1764 for (unsigned j = 0; j < VL.size(); ++j) {
1765 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1766 if (Ty0 != CurTy) {
1767          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1768 BS.cancelScheduling(VL, VL0);
1769 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1770 return;
1771 }
1772 }
1773
1774 // We don't combine GEPs with non-constant indexes.
1775 for (unsigned j = 0; j < VL.size(); ++j) {
1776 auto Op = cast<Instruction>(VL[j])->getOperand(1);
1777 if (!isa<ConstantInt>(Op)) {
1778          DEBUG(
1779              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1780 BS.cancelScheduling(VL, VL0);
1781 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1782 return;
1783 }
1784 }
1785
1786 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1787      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1788 for (unsigned i = 0, e = 2; i < e; ++i) {
1789 ValueList Operands;
1790 // Prepare the operand vector.
1791 for (Value *j : VL)
1792 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1793
1794 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1795 }
1796 return;
1797 }
1798 case Instruction::Store: {
1799      // Check if the stores are consecutive or if we need to swizzle them.
1800 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1801 if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1802 BS.cancelScheduling(VL, VL0);
1803 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1804          DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1805 return;
1806 }
1807
1808 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1809      DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1810
1811 ValueList Operands;
1812 for (Value *j : VL)
1813 Operands.push_back(cast<Instruction>(j)->getOperand(0));
1814
1815 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1816 return;
1817 }
1818 case Instruction::Call: {
1819 // Check if the calls are all to the same vectorizable intrinsic.
1820 CallInst *CI = cast<CallInst>(VL0);
1821 // Check if this is an Intrinsic call or something that can be
1822 // represented by an intrinsic call
1823 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1824 if (!isTriviallyVectorizable(ID)) {
1825 BS.cancelScheduling(VL, VL0);
1826 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1827        DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1828 return;
1829 }
1830 Function *Int = CI->getCalledFunction();
1831 Value *A1I = nullptr;
1832 if (hasVectorInstrinsicScalarOpd(ID, 1))
1833 A1I = CI->getArgOperand(1);
1834 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1835 CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1836 if (!CI2 || CI2->getCalledFunction() != Int ||
1837 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
1838 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
1839 BS.cancelScheduling(VL, VL0);
1840 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1841          DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1842                       << "\n");
1843 return;
1844 }
1845 // ctlz,cttz and powi are special intrinsics whose second argument
1846 // should be same in order for them to be vectorized.
1847 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1848 Value *A1J = CI2->getArgOperand(1);
1849 if (A1I != A1J) {
1850 BS.cancelScheduling(VL, VL0);
1851 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1852            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1853                         << " argument "<< A1I<<"!=" << A1J
1854                         << "\n");
1855 return;
1856 }
1857 }
1858 // Verify that the bundle operands are identical between the two calls.
1859 if (CI->hasOperandBundles() &&
1860 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
1861 CI->op_begin() + CI->getBundleOperandsEndIndex(),
1862 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
1863 BS.cancelScheduling(VL, VL0);
1864 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1865          DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
1866                       << *VL[i] << '\n');
1867 return;
1868 }
1869 }
1870
1871 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1872 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1873 ValueList Operands;
1874 // Prepare the operand vector.
1875 for (Value *j : VL) {
1876 CallInst *CI2 = dyn_cast<CallInst>(j);
1877 Operands.push_back(CI2->getArgOperand(i));
1878 }
1879 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1880 }
1881 return;
1882 }
1883 case Instruction::ShuffleVector:
1884 // If this is not an alternate sequence of opcode like add-sub
1885 // then do not vectorize this instruction.
1886 if (!S.IsAltShuffle) {
1887 BS.cancelScheduling(VL, VL0);
1888 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1889        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1890 return;
1891 }
1892 newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
1893      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1894
1895 // Reorder operands if reordering would enable vectorization.
1896 if (isa<BinaryOperator>(VL0)) {
1897 ValueList Left, Right;
1898 reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
1899 buildTree_rec(Left, Depth + 1, UserTreeIdx);
1900 buildTree_rec(Right, Depth + 1, UserTreeIdx);
1901 return;
1902 }
1903
1904 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1905 ValueList Operands;
1906 // Prepare the operand vector.
1907 for (Value *j : VL)
1908 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1909
1910 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1911 }
1912 return;
1913
1914 default:
1915 BS.cancelScheduling(VL, VL0);
1916 newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
1917    DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1918 return;
1919 }
1920}
1921
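// Returns the number of elements if the aggregate type T can be treated as a
// homogeneous vector whose size fits within the vector register limits and
// matches T's store size, or 0 otherwise.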
1922unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
1923 unsigned N;
1924 Type *EltTy;
1925 auto *ST = dyn_cast<StructType>(T);
1926 if (ST) {
1927 N = ST->getNumElements();
1928 EltTy = *ST->element_begin();
1929 } else {
1930 N = cast<ArrayType>(T)->getNumElements();
1931 EltTy = cast<ArrayType>(T)->getElementType();
1932 }
1933 if (!isValidElementType(EltTy))
1934 return 0;
1935 uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
1936 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
1937 return 0;
1938 if (ST) {
1939 // Check that struct is homogeneous.
1940 for (const auto *Ty : ST->elements())
1941 if (Ty != EltTy)
1942 return 0;
1943 }
1944 return N;
1945}
1946
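// Returns true if every value in VL extracts from the same source
// vector/aggregate with lane I extracting index I, so the existing source can
// be reused instead of gathering.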
1947bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
1948 Instruction *E0 = cast<Instruction>(OpValue);
1949  assert(E0->getOpcode() == Instruction::ExtractElement ||
1950         E0->getOpcode() == Instruction::ExtractValue);
1951  assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
1952 // Check if all of the extracts come from the same vector and from the
1953 // correct offset.
1954 Value *Vec = E0->getOperand(0);
1955
1956 // We have to extract from a vector/aggregate with the same number of elements.
1957 unsigned NElts;
1958 if (E0->getOpcode() == Instruction::ExtractValue) {
1959 const DataLayout &DL = E0->getModule()->getDataLayout();
1960 NElts = canMapToVector(Vec->getType(), DL);
1961 if (!NElts)
1962 return false;
1963 // Check if load can be rewritten as load of vector.
1964 LoadInst *LI = dyn_cast<LoadInst>(Vec);
1965 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
1966 return false;
1967 } else {
1968 NElts = Vec->getType()->getVectorNumElements();
1969 }
1970
1971 if (NElts != VL.size())
1972 return false;
1973
1974 // Check that all of the indices extract from the correct offset.
1975 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
1976 Instruction *Inst = cast<Instruction>(VL[I]);
1977 if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
1978 return false;
1979 if (Inst->getOperand(0) != Vec)
1980 return false;
1981 }
1982
1983 return true;
1984}
1985
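// Returns true if I has a single use or every one of its users already has a
// tree entry (i.e. will itself be vectorized).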
1986bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
1987 return I->hasOneUse() ||
1988 std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
1989 return ScalarToTreeEntry.count(U) > 0;
1990 });
1991}
1992
1993int BoUpSLP::getEntryCost(TreeEntry *E) {
1994 ArrayRef<Value*> VL = E->Scalars;
1995
1996 Type *ScalarTy = VL[0]->getType();
1997 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1998 ScalarTy = SI->getValueOperand()->getType();
1999 else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
2000 ScalarTy = CI->getOperand(0)->getType();
2001 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2002
2003 // If we have computed a smaller type for the expression, update VecTy so
2004 // that the costs will be accurate.
2005 if (MinBWs.count(VL[0]))
2006 VecTy = VectorType::get(
2007 IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
2008
2009 unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
2010 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
2011 int ReuseShuffleCost = 0;
2012 if (NeedToShuffleReuses) {
2013 ReuseShuffleCost =
2014 TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
2015 }
2016 if (E->NeedToGather) {
2017 if (allConstant(VL))
2018 return 0;
2019 if (isSplat(VL)) {
2020 return ReuseShuffleCost +
2021 TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
2022 }
2023 if (getSameOpcode(VL).Opcode == Instruction::ExtractElement &&
2024 allSameType(VL) && allSameBlock(VL)) {
2025 Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
2026 if (ShuffleKind.hasValue()) {
2027 int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
2028 for (auto *V : VL) {
2029 // If all users of instruction are going to be vectorized and this
2030 // instruction itself is not going to be vectorized, consider this
2031 // instruction as dead and remove its cost from the final cost of the
2032 // vectorized tree.
2033 if (areAllUsersVectorized(cast<Instruction>(V)) &&
2034 !ScalarToTreeEntry.count(V)) {
2035 auto *IO = cast<ConstantInt>(
2036 cast<ExtractElementInst>(V)->getIndexOperand());
2037 Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
2038 IO->getZExtValue());
2039 }
2040 }
2041 return ReuseShuffleCost + Cost;
2042 }
2043 }
2044 return ReuseShuffleCost + getGatherCost(VL);
2045 }
2046 InstructionsState S = getSameOpcode(VL);
2047  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
2048 Instruction *VL0 = cast<Instruction>(S.OpValue);
2049 unsigned ShuffleOrOp = S.IsAltShuffle ?
2050 (unsigned) Instruction::ShuffleVector : S.Opcode;
2051 switch (ShuffleOrOp) {
2052 case Instruction::PHI:
2053 return 0;
2054
2055 case Instruction::ExtractValue:
2056 case Instruction::ExtractElement:
2057 if (NeedToShuffleReuses) {
2058 unsigned Idx = 0;
2059 for (unsigned I : E->ReuseShuffleIndices) {
2060 if (ShuffleOrOp == Instruction::ExtractElement) {
2061 auto *IO = cast<ConstantInt>(
2062 cast<ExtractElementInst>(VL[I])->getIndexOperand());
2063 Idx = IO->getZExtValue();
2064 ReuseShuffleCost -= TTI->getVectorInstrCost(
2065 Instruction::ExtractElement, VecTy, Idx);
2066 } else {
2067 ReuseShuffleCost -= TTI->getVectorInstrCost(
2068 Instruction::ExtractElement, VecTy, Idx);
2069 ++Idx;
2070 }
2071 }
2072 Idx = ReuseShuffleNumbers;
2073 for (Value *V : VL) {
2074 if (ShuffleOrOp == Instruction::ExtractElement) {
2075 auto *IO = cast<ConstantInt>(
2076 cast<ExtractElementInst>(V)->getIndexOperand());
2077 Idx = IO->getZExtValue();
2078 } else {
2079 --Idx;
2080 }
2081 ReuseShuffleCost +=
2082 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
2083 }
2084 }
2085 if (canReuseExtract(VL, S.OpValue)) {
2086 int DeadCost = ReuseShuffleCost;
2087 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
2088 Instruction *E = cast<Instruction>(VL[i]);
2089 // If all users are going to be vectorized, instruction can be
2090 // considered as dead.
2091 // The same, if have only one user, it will be vectorized for sure.
2092 if (areAllUsersVectorized(E))
2093 // Take credit for instruction that will become dead.
2094 DeadCost -=
2095 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
2096 }
2097 return DeadCost;
2098 }
2099 return ReuseShuffleCost + getGatherCost(VL);
2100
2101 case Instruction::ZExt:
2102 case Instruction::SExt:
2103 case Instruction::FPToUI:
2104 case Instruction::FPToSI:
2105 case Instruction::FPExt:
2106 case Instruction::PtrToInt:
2107 case Instruction::IntToPtr:
2108 case Instruction::SIToFP:
2109 case Instruction::UIToFP:
2110 case Instruction::Trunc:
2111 case Instruction::FPTrunc:
2112 case Instruction::BitCast: {
2113 Type *SrcTy = VL0->getOperand(0)->getType();
2114 if (NeedToShuffleReuses) {
2115 ReuseShuffleCost -=
2116 (ReuseShuffleNumbers - VL.size()) *
2117 TTI->getCastInstrCost(S.Opcode, ScalarTy, SrcTy, VL0);
2118 }
2119
2120 // Calculate the cost of this instruction.
2121 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
2122 VL0->getType(), SrcTy, VL0);
2123
2124 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
2125 int VecCost = 0;
2126 // Check if the values are candidates to demote.
2127 if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
2128 VecCost = ReuseShuffleCost +
2129 TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
2130 }
2131 return VecCost - ScalarCost;
2132 }
2133 case Instruction::FCmp:
2134 case Instruction::ICmp:
2135 case Instruction::Select: {
2136 // Calculate the cost of this instruction.
2137 if (NeedToShuffleReuses) {
2138 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
2139 TTI->getCmpSelInstrCost(S.Opcode, ScalarTy,
2140 Builder.getInt1Ty(), VL0);
2141 }
2142 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
2143 int ScalarCost = VecTy->getNumElements() *
2144 TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
2145 int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
2146 return ReuseShuffleCost + VecCost - ScalarCost;
2147 }
2148 case Instruction::Add:
2149 case Instruction::FAdd:
2150 case Instruction::Sub:
2151 case Instruction::FSub:
2152 case Instruction::Mul:
2153 case Instruction::FMul:
2154 case Instruction::UDiv:
2155 case Instruction::SDiv:
2156 case Instruction::FDiv:
2157 case Instruction::URem:
2158 case Instruction::SRem:
2159 case Instruction::FRem:
2160 case Instruction::Shl:
2161 case Instruction::LShr:
2162 case Instruction::AShr:
2163 case Instruction::And:
2164 case Instruction::Or:
2165 case Instruction::Xor: {
2166 // Certain instructions can be cheaper to vectorize if they have a
2167 // constant second vector operand.
2168 TargetTransformInfo::OperandValueKind Op1VK =
2169 TargetTransformInfo::OK_AnyValue;
2170 TargetTransformInfo::OperandValueKind Op2VK =
2171 TargetTransformInfo::OK_UniformConstantValue;
2172 TargetTransformInfo::OperandValueProperties Op1VP =
2173 TargetTransformInfo::OP_None;
2174 TargetTransformInfo::OperandValueProperties Op2VP =
2175 TargetTransformInfo::OP_None;
2176
2177 // If all operands are exactly the same ConstantInt then set the
2178 // operand kind to OK_UniformConstantValue.
2179 // If instead not all operands are constants, then set the operand kind
2180 // to OK_AnyValue. If all operands are constants but not the same,
2181 // then set the operand kind to OK_NonUniformConstantValue.
2182 ConstantInt *CInt = nullptr;
2183 for (unsigned i = 0; i < VL.size(); ++i) {
2184 const Instruction *I = cast<Instruction>(VL[i]);
2185 if (!isa<ConstantInt>(I->getOperand(1))) {
2186 Op2VK = TargetTransformInfo::OK_AnyValue;
2187 break;
2188 }
2189 if (i == 0) {
2190 CInt = cast<ConstantInt>(I->getOperand(1));
2191 continue;
2192 }
2193 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
2194 CInt != cast<ConstantInt>(I->getOperand(1)))
2195 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
2196 }
2197 // FIXME: Currently cost of model modification for division by power of
2198 // 2 is handled for X86 and AArch64. Add support for other targets.
2199 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
2200 CInt->getValue().isPowerOf2())
2201 Op2VP = TargetTransformInfo::OP_PowerOf2;
2202
2203 SmallVector<const Value *, 4> Operands(VL0->operand_values());
2204 if (NeedToShuffleReuses) {
2205 ReuseShuffleCost -=
2206 (ReuseShuffleNumbers - VL.size()) *
2207 TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
2208 Op2VP, Operands);
2209 }
2210 int ScalarCost =
2211 VecTy->getNumElements() *
2212 TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
2213 Op2VP, Operands);
2214 int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
2215 Op1VP, Op2VP, Operands);
2216 return ReuseShuffleCost + VecCost - ScalarCost;
2217 }
2218 case Instruction::GetElementPtr: {
2219 TargetTransformInfo::OperandValueKind Op1VK =
2220 TargetTransformInfo::OK_AnyValue;
2221 TargetTransformInfo::OperandValueKind Op2VK =
2222 TargetTransformInfo::OK_UniformConstantValue;
2223
2224 if (NeedToShuffleReuses) {
2225 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
2226 TTI->getArithmeticInstrCost(Instruction::Add,
2227 ScalarTy, Op1VK, Op2VK);
2228 }
2229 int ScalarCost =
2230 VecTy->getNumElements() *
2231 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
2232 int VecCost =
2233 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
2234
2235 return ReuseShuffleCost + VecCost - ScalarCost;
2236 }
2237 case Instruction::Load: {
2238 // Cost of wide load - cost of scalar loads.
2239 unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
2240 if (NeedToShuffleReuses) {
2241 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
2242 TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
2243 alignment, 0, VL0);
2244 }
2245 int ScalarLdCost = VecTy->getNumElements() *
2246 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
2247 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
2248 VecTy, alignment, 0, VL0);
2249 if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
2250 VecLdCost += TTI->getShuffleCost(
2251 TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
2252 }
2253 return ReuseShuffleCost + VecLdCost - ScalarLdCost;
2254 }
2255 case Instruction::Store: {
2256 // We know that we can merge the stores. Calculate the cost.
2257 unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
2258 if (NeedToShuffleReuses) {
2259 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
2260 TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
2261 alignment, 0, VL0);
2262 }
2263 int ScalarStCost = VecTy->getNumElements() *
2264 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
2265 int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
2266 VecTy, alignment, 0, VL0);
2267 return ReuseShuffleCost + VecStCost - ScalarStCost;
2268 }
2269 case Instruction::Call: {
2270 CallInst *CI = cast<CallInst>(VL0);
2271 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2272
2273 // Calculate the cost of the scalar and vector calls.
2274 SmallVector<Type*, 4> ScalarTys;
2275 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
2276 ScalarTys.push_back(CI->getArgOperand(op)->getType());
2277
2278 FastMathFlags FMF;
2279 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2280 FMF = FPMO->getFastMathFlags();
2281
2282 if (NeedToShuffleReuses) {
2283 ReuseShuffleCost -=
2284 (ReuseShuffleNumbers - VL.size()) *
2285 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
2286 }
2287 int ScalarCallCost = VecTy->getNumElements() *
2288 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
2289
2290 SmallVector<Value *, 4> Args(CI->arg_operands());
2291 int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
2292 VecTy->getNumElements());
2293
2294    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
2295          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
2296          << " for " << *CI << "\n");
2297
2298 return ReuseShuffleCost + VecCallCost - ScalarCallCost;
2299 }
2300 case Instruction::ShuffleVector: {
2301 TargetTransformInfo::OperandValueKind Op1VK =
2302 TargetTransformInfo::OK_AnyValue;
2303 TargetTransformInfo::OperandValueKind Op2VK =
2304 TargetTransformInfo::OK_AnyValue;
2305 int ScalarCost = 0;
2306 if (NeedToShuffleReuses) {
2307 for (unsigned Idx : E->ReuseShuffleIndices) {
2308 Instruction *I = cast<Instruction>(VL[Idx]);
2309 if (!I)
2310 continue;
2311 ReuseShuffleCost -= TTI->getArithmeticInstrCost(
2312 I->getOpcode(), ScalarTy, Op1VK, Op2VK);
2313 }
2314 for (Value *V : VL) {
2315 Instruction *I = cast<Instruction>(V);
2316 if (!I)
2317 continue;
2318 ReuseShuffleCost += TTI->getArithmeticInstrCost(
2319 I->getOpcode(), ScalarTy, Op1VK, Op2VK);
2320 }
2321 }
2322 int VecCost = 0;
2323 for (Value *i : VL) {
2324 Instruction *I = cast<Instruction>(i);
2325 if (!I)
2326 break;
2327 ScalarCost +=
2328 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
2329 }
2330 // VecCost is equal to sum of the cost of creating 2 vectors
2331 // and the cost of creating shuffle.
2332 Instruction *I0 = cast<Instruction>(VL[0]);
2333 VecCost =
2334 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
2335 Instruction *I1 = cast<Instruction>(VL[1]);
2336 VecCost +=
2337 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
2338 VecCost +=
2339 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
2340 return ReuseShuffleCost + VecCost - ScalarCost;
2341 }
2342 default:
2343    llvm_unreachable("Unknown instruction");
2344 }
2345}
2346
2347bool BoUpSLP::isFullyVectorizableTinyTree() {
2348  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
2349        VectorizableTree.size() << " is fully vectorizable .\n");
2350
2351 // We only handle trees of heights 1 and 2.
2352 if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
2353 return true;
2354
2355 if (VectorizableTree.size() != 2)
2356 return false;
2357
2358 // Handle splat and all-constants stores.
2359 if (!VectorizableTree[0].NeedToGather &&
2360 (allConstant(VectorizableTree[1].Scalars) ||
2361 isSplat(VectorizableTree[1].Scalars)))
2362 return true;
2363
2364 // Gathering cost would be too much for tiny trees.
2365 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
2366 return false;
2367
2368 return true;
2369}
2370
2371bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
2372 // We can vectorize the tree if its size is greater than or equal to the
2373 // minimum size specified by the MinTreeSize command line option.
2374 if (VectorizableTree.size() >= MinTreeSize)
2375 return false;
2376
2377 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
2378 // can vectorize it if we can prove it fully vectorizable.
2379 if (isFullyVectorizableTinyTree())
2380 return false;
2381
2382  assert(VectorizableTree.empty()
2383             ? ExternalUses.empty()
2384             : true && "We shouldn't have any external users");
2385
2386 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
2387 // vectorizable.
2388 return true;
2389}
2390
2391int BoUpSLP::getSpillCost() {
2392 // Walk from the bottom of the tree to the top, tracking which values are
2393 // live. When we see a call instruction that is not part of our tree,
2394 // query TTI to see if there is a cost to keeping values live over it
2395 // (for example, if spills and fills are required).
2396 unsigned BundleWidth = VectorizableTree.front().Scalars.size();
2397 int Cost = 0;
2398
2399 SmallPtrSet<Instruction*, 4> LiveValues;
2400 Instruction *PrevInst = nullptr;
2401
2402 for (const auto &N : VectorizableTree) {
2403 Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
2404 if (!Inst)
2405 continue;
2406
2407 if (!PrevInst) {
2408 PrevInst = Inst;
2409 continue;
2410 }
2411
2412 // Update LiveValues.
2413 LiveValues.erase(PrevInst);
2414 for (auto &J : PrevInst->operands()) {
2415 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
2416 LiveValues.insert(cast<Instruction>(&*J));
2417 }
2418
2419    DEBUG(
2420      dbgs() << "SLP: #LV: " << LiveValues.size();
2421      for (auto *X : LiveValues)
2422        dbgs() << " " << X->getName();
2423      dbgs() << ", Looking at ";
2424      Inst->dump();
2425      );
2426
2427 // Now find the sequence of instructions between PrevInst and Inst.
2428 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
2429 PrevInstIt =
2430 PrevInst->getIterator().getReverse();
2431 while (InstIt != PrevInstIt) {
2432 if (PrevInstIt == PrevInst->getParent()->rend()) {
2433 PrevInstIt = Inst->getParent()->rbegin();
2434 continue;
2435 }
2436
2437 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
2438 SmallVector<Type*, 4> V;
2439 for (auto *II : LiveValues)
2440 V.push_back(VectorType::get(II->getType(), BundleWidth));
2441 Cost += TTI->getCostOfKeepingLiveOverCall(V);
2442 }
2443
2444 ++PrevInstIt;
2445 }
2446
2447 PrevInst = Inst;
2448 }
2449
2450 return Cost;
2451}
2452
2453int BoUpSLP::getTreeCost() {
2454 int Cost = 0;
2455  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
2456               VectorizableTree.size() << ".\n");
2457
2458 unsigned BundleWidth = VectorizableTree[0].Scalars.size();
2459
2460 for (TreeEntry &TE : VectorizableTree) {
2461 int C = getEntryCost(&TE);
2462    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
2463                 << *TE.Scalars[0] << ".\n");
2464 Cost += C;
2465 }
2466
2467 SmallSet<Value *, 16> ExtractCostCalculated;
2468 int ExtractCost = 0;
2469 for (ExternalUser &EU : ExternalUses) {
2470 // We only add extract cost once for the same scalar.
2471 if (!ExtractCostCalculated.insert(EU.Scalar).second)
2472 continue;
2473
2474 // Uses by ephemeral values are free (because the ephemeral value will be
2475 // removed prior to code generation, and so the extraction will be
2476 // removed as well).
2477 if (EphValues.count(EU.User))
2478 continue;
2479
2480 // If we plan to rewrite the tree in a smaller type, we will need to sign
2481 // extend the extracted value back to the original type. Here, we account
2482 // for the extract and the added cost of the sign extend if needed.
2483 auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
2484 auto *ScalarRoot = VectorizableTree[0].Scalars[0];
2485 if (MinBWs.count(ScalarRoot)) {
2486 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
2487 auto Extend =
2488 MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
2489 VecTy = VectorType::get(MinTy, BundleWidth);
2490 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
2491 VecTy, EU.Lane);
2492 } else {
2493 ExtractCost +=
2494 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
2495 }
2496 }
2497
2498 int SpillCost = getSpillCost();
2499 Cost += SpillCost + ExtractCost;
2500
2501 std::string Str;
2502 {
2503 raw_string_ostream OS(Str);
2504 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
2505 << "SLP: Extract Cost = " << ExtractCost << ".\n"
2506 << "SLP: Total Cost = " << Cost << ".\n";
2507 }
2508 DEBUG(dbgs() << Str);
2509
2510 if (ViewSLPTree)
2511 ViewGraph(this, "SLP" + F->getName(), false, Str);
2512
2513 return Cost;
2514}
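
getTreeCost above reduces to simple arithmetic: the sum of the per-bundle entry costs plus the extract and spill costs, where a negative total signals a profitable tree. A minimal standalone sketch of that combination follows; it uses plain ints and made-up names (totalTreeCost, EntryCosts) rather than the pass's types.

#include <iostream>
#include <numeric>
#include <vector>

int totalTreeCost(const std::vector<int> &EntryCosts, int ExtractCost,
                  int SpillCost) {
  // Sum the per-entry costs, then fold in the extract and spill terms,
  // mirroring "Cost += SpillCost + ExtractCost" in getTreeCost.
  int Cost = std::accumulate(EntryCosts.begin(), EntryCosts.end(), 0);
  return Cost + ExtractCost + SpillCost;
}

int main() {
  // Three bundles costing -4, 1 and -2, plus 2 for extracts and no spill,
  // give a net cost of -3, i.e. vectorization would be considered a win.
  std::cout << totalTreeCost({-4, 1, -2}, 2, 0) << "\n"; // prints -3
}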
2515
2516int BoUpSLP::getGatherCost(Type *Ty,
2517 const DenseSet<unsigned> &ShuffledIndices) {
2518 int Cost = 0;
2519 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
2520 if (!ShuffledIndices.count(i))
2521 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
2522 if (!ShuffledIndices.empty())
2523 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
2524 return Cost;
2525}
2526
2527int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
2528 // Find the type of the operands in VL.
2529 Type *ScalarTy = VL[0]->getType();
2530 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2531 ScalarTy = SI->getValueOperand()->getType();
2532 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2533 // Find the cost of inserting/extracting values from the vector.
2534 // Check if the same elements are inserted several times and count them as
2535 // shuffle candidates.
2536 DenseSet<unsigned> ShuffledElements;
2537 DenseSet<Value *> UniqueElements;
2538 // Iterate in reverse order to consider insert elements with the high cost.
2539 for (unsigned I = VL.size(); I > 0; --I) {
2540 unsigned Idx = I - 1;
2541 if (!UniqueElements.insert(VL[Idx]).second)
2542 ShuffledElements.insert(Idx);
2543 }
2544 return getGatherCost(VecTy, ShuffledElements);
2545}
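
The reverse scan in getGatherCost above decides which lanes can be served by a shuffle instead of an extra insertelement: the later occurrence of a repeated value stays "unique" and every earlier duplicate lane is marked shuffled. A self-contained sketch of just that scan, with std::string standing in for Value* (all names here are illustrative):

#include <iostream>
#include <set>
#include <string>
#include <unordered_set>
#include <vector>

std::set<unsigned> shuffledIndices(const std::vector<std::string> &VL) {
  std::unordered_set<std::string> Unique;
  std::set<unsigned> Shuffled;
  // Walk from the last element to the first; an element already seen at a
  // higher index is recorded as a shuffle candidate.
  for (unsigned I = VL.size(); I > 0; --I) {
    unsigned Idx = I - 1;
    if (!Unique.insert(VL[Idx]).second)
      Shuffled.insert(Idx);
  }
  return Shuffled;
}

int main() {
  // For {a, b, a, c} only index 0 duplicates a later element, so only that
  // lane would be covered by the shuffle instead of an insertelement.
  for (unsigned Idx : shuffledIndices({"a", "b", "a", "c"}))
    std::cout << Idx << "\n"; // prints 0
}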
2546
2547// Reorder commutative operations in alternate shuffle if the resulting vectors
2548// are consecutive loads. This would allow us to vectorize the tree.
2549// If we have something like-
2550// load a[0] - load b[0]
2551// load b[1] + load a[1]
2552// load a[2] - load b[2]
2553// load a[3] + load b[3]
2554// Reordering the second load b[1] load a[1] would allow us to vectorize this
2555// code.
2556void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
2557 SmallVectorImpl<Value *> &Left,
2558 SmallVectorImpl<Value *> &Right) {
2559 // Push left and right operands of binary operation into Left and Right
2560 unsigned AltOpcode = getAltOpcode(Opcode);
2561 (void)AltOpcode;
2562 for (Value *V : VL) {
2563 auto *I = cast<Instruction>(V);
2564 assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
2565 "Incorrect instruction in vector");
2566 Left.push_back(I->getOperand(0));
2567 Right.push_back(I->getOperand(1));
2568 }
2569
2570 // Reorder if we have a commutative operation and consecutive accesses
2571 // are on either side of the alternate instructions.
2572 for (unsigned j = 0; j < VL.size() - 1; ++j) {
2573 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2574 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2575 Instruction *VL1 = cast<Instruction>(VL[j]);
2576 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2577 if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2578 std::swap(Left[j], Right[j]);
2579 continue;
2580 } else if (VL2->isCommutative() &&
2581 isConsecutiveAccess(L, L1, *DL, *SE)) {
2582 std::swap(Left[j + 1], Right[j + 1]);
2583 continue;
2584 }
2585 // else unchanged
2586 }
2587 }
2588 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2589 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2590 Instruction *VL1 = cast<Instruction>(VL[j]);
2591 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2592 if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2593 std::swap(Left[j], Right[j]);
2594 continue;
2595 } else if (VL2->isCommutative() &&
2596 isConsecutiveAccess(L, L1, *DL, *SE)) {
2597 std::swap(Left[j + 1], Right[j + 1]);
2598 continue;
2599 }
2600 // else unchanged
2601 }
2602 }
2603 }
2604}
2605
2606 // Return true if I should be commuted before adding its left and right
2607// operands to the arrays Left and Right.
2608//
2609 // The vectorizer is trying to either have all elements on one side be
2610 // instructions with the same opcode to enable further vectorization, or to
2611 // have a splat to lower the vectorization cost.
2612static bool shouldReorderOperands(
2613 int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
2614 ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
2615 bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
2616 VLeft = I.getOperand(0);
2617 VRight = I.getOperand(1);
2618 // If we have "SplatRight", try to see if commuting is needed to preserve it.
2619 if (SplatRight) {
2620 if (VRight == Right[i - 1])
2621 // Preserve SplatRight
2622 return false;
2623 if (VLeft == Right[i - 1]) {
2624 // Commuting would preserve SplatRight, but we don't want to break
2625 // SplatLeft either, i.e. preserve the original order if possible.
2626 // (FIXME: why do we care?)
2627 if (SplatLeft && VLeft == Left[i - 1])
2628 return false;
2629 return true;
2630 }
2631 }
2632 // Symmetrically handle Right side.
2633 if (SplatLeft) {
2634 if (VLeft == Left[i - 1])
2635 // Preserve SplatLeft
2636 return false;
2637 if (VRight == Left[i - 1])
2638 return true;
2639 }
2640
2641 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
2642 Instruction *IRight = dyn_cast<Instruction>(VRight);
2643
2644 // If we have "AllSameOpcodeRight", try to see if the left operand preserves
2645 // it and not the right; in this case we want to commute.
2646 if (AllSameOpcodeRight) {
2647 unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
2648 if (IRight && RightPrevOpcode == IRight->getOpcode())
2649 // Do not commute, a match on the right preserves AllSameOpcodeRight
2650 return false;
2651 if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
2652 // We have a match and may want to commute, but first check if there is
2653 // not also a match on the existing operands on the Left to preserve
2654 // AllSameOpcodeLeft, i.e. preserve the original order if possible.
2655 // (FIXME: why do we care?)
2656 if (AllSameOpcodeLeft && ILeft &&
2657 cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
2658 return false;
2659 return true;
2660 }
2661 }
2662 // Symmetrically handle Left side.
2663 if (AllSameOpcodeLeft) {
2664 unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
2665 if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
2666 return false;
2667 if (IRight && LeftPrevOpcode == IRight->getOpcode())
2668 return true;
2669 }
2670 return false;
2671}
2672
2673void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
2674 ArrayRef<Value *> VL,
2675 SmallVectorImpl<Value *> &Left,
2676 SmallVectorImpl<Value *> &Right) {
2677 if (!VL.empty()) {
2678 // Peel the first iteration out of the loop since there's nothing
2679 // interesting to do anyway and it simplifies the checks in the loop.
2680 auto *I = cast<Instruction>(VL[0]);
2681 Value *VLeft = I->getOperand(0);
2682 Value *VRight = I->getOperand(1);
2683 if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
2684 // Favor having instruction to the right. FIXME: why?
2685 std::swap(VLeft, VRight);
2686 Left.push_back(VLeft);
2687 Right.push_back(VRight);
2688 }
2689
2690 // Keep track if we have instructions with all the same opcode on one side.
2691 bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
2692 bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
2693 // Keep track if we have one side with all the same value (broadcast).
2694 bool SplatLeft = true;
2695 bool SplatRight = true;
2696
2697 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
2698 Instruction *I = cast<Instruction>(VL[i]);
2699 assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
2700 (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
2701 "Can only process commutative instruction");
2702 // Commute to favor either a splat or maximizing having the same opcodes on
2703 // one side.
2704 Value *VLeft;
2705 Value *VRight;
2706 if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
2707 AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
2708 VRight)) {
2709 Left.push_back(VRight);
2710 Right.push_back(VLeft);
2711 } else {
2712 Left.push_back(VLeft);
2713 Right.push_back(VRight);
2714 }
2715 // Update Splat* and AllSameOpcode* after the insertion.
2716 SplatRight = SplatRight && (Right[i - 1] == Right[i]);
2717 SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
2718 AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
2719 (cast<Instruction>(Left[i - 1])->getOpcode() ==
2720 cast<Instruction>(Left[i])->getOpcode());
2721 AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
2722 (cast<Instruction>(Right[i - 1])->getOpcode() ==
2723 cast<Instruction>(Right[i])->getOpcode());
2724 }
2725
2726 // If one operand ends up being a broadcast, return this operand order.
2727 if (SplatRight || SplatLeft)
2728 return;
2729
2730 // Finally check if we can get longer vectorizable chain by reordering
2731 // without breaking the good operand order detected above.
2732 // E.g. If we have something like-
2733 // load a[0] load b[0]
2734 // load b[1] load a[1]
2735 // load a[2] load b[2]
2736 // load a[3] load b[3]
2737 // Reordering the second load b[1] load a[1] would allow us to vectorize
2738 // this code and we still retain AllSameOpcode property.
2739 // FIXME: This load reordering might break AllSameOpcode in some rare cases
2740 // such as-
2741 // add a[0],c[0] load b[0]
2742 // add a[1],c[2] load b[1]
2743 // b[2] load b[2]
2744 // add a[3],c[3] load b[3]
2745 for (unsigned j = 0; j < VL.size() - 1; ++j) {
2746 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2747 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2748 if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2749 std::swap(Left[j + 1], Right[j + 1]);
2750 continue;
2751 }
2752 }
2753 }
2754 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2755 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2756 if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2757 std::swap(Left[j + 1], Right[j + 1]);
2758 continue;
2759 }
2760 }
2761 }
2762 // else unchanged
2763 }
2764}
2765
2766void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
2767 // Get the basic block this bundle is in. All instructions in the bundle
2768 // should be in this block.
2769 auto *Front = cast<Instruction>(OpValue);
2770 auto *BB = Front->getParent();
2771 const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
2772 const unsigned AltOpcode = getAltOpcode(Opcode);
2773 assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
2774 return !sameOpcodeOrAlt(Opcode, AltOpcode,
2775 cast<Instruction>(V)->getOpcode()) ||
2776 cast<Instruction>(V)->getParent() == BB;
2777 }));
2778
2779 // The last instruction in the bundle in program order.
2780 Instruction *LastInst = nullptr;
2781
2782 // Find the last instruction. The common case should be that BB has been
2783 // scheduled, and the last instruction is VL.back(). So we start with
2784 // VL.back() and iterate over schedule data until we reach the end of the
2785 // bundle. The end of the bundle is marked by null ScheduleData.
2786 if (BlocksSchedules.count(BB)) {
2787 auto *Bundle =
2788 BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
2789 if (Bundle && Bundle->isPartOfBundle())
2790 for (; Bundle; Bundle = Bundle->NextInBundle)
2791 if (Bundle->OpValue == Bundle->Inst)
2792 LastInst = Bundle->Inst;
2793 }
2794
2795 // LastInst can still be null at this point if there's either not an entry
2796 // for BB in BlocksSchedules or there's no ScheduleData available for
2797 // VL.back(). This can be the case if buildTree_rec aborts for various
2798 // reasons (e.g., the maximum recursion depth is reached, the maximum region
2799 // size is reached, etc.). ScheduleData is initialized in the scheduling
2800 // "dry-run".
2801 //
2802 // If this happens, we can still find the last instruction by brute force. We
2803 // iterate forwards from Front (inclusive) until we either see all
2804 // instructions in the bundle or reach the end of the block. If Front is the
2805 // last instruction in program order, LastInst will be set to Front, and we
2806 // will visit all the remaining instructions in the block.
2807 //
2808 // One of the reasons we exit early from buildTree_rec is to place an upper
2809 // bound on compile-time. Thus, taking an additional compile-time hit here is
2810 // not ideal. However, this should be exceedingly rare since it requires that
2811 // we both exit early from buildTree_rec and that the bundle be out-of-order
2812 // (causing us to iterate all the way to the end of the block).
2813 if (!LastInst) {
2814 SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
2815 for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
2816 if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
2817 LastInst = &I;
2818 if (Bundle.empty())
2819 break;
2820 }
2821 }
2822
2823 // Set the insertion point after the last instruction in the bundle. Set the
2824 // debug location to Front.
2825 Builder.SetInsertPoint(BB, ++LastInst->getIterator());
2826 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
2827}
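
When no ScheduleData is available, setInsertPointAfterBundle above falls back to a forward scan from Front and keeps the last bundle member it meets, stopping as soon as every member has been seen. A standalone sketch of that fallback, with strings standing in for instructions (lastBundleMember and the other names are assumptions for this note, not the pass's API):

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

std::string lastBundleMember(const std::vector<std::string> &Block,
                             std::unordered_set<std::string> Bundle) {
  std::string Last;
  for (const std::string &Inst : Block) {
    if (Bundle.erase(Inst))
      Last = Inst;          // most recent bundle member seen so far
    if (Bundle.empty())
      break;                // all members found, stop scanning early
  }
  return Last;
}

int main() {
  // Bundle {add1, add3} inside block [add1, mul2, add3, store4] ends at add3,
  // so the vectorized instruction would be inserted right after it.
  std::cout << lastBundleMember({"add1", "mul2", "add3", "store4"},
                                {"add1", "add3"})
            << "\n"; // prints add3
}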
2828
2829Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2830 Value *Vec = UndefValue::get(Ty);
2831 // Generate the 'InsertElement' instruction.
2832 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2833 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2834 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2835 GatherSeq.insert(Insrt);
2836 CSEBlocks.insert(Insrt->getParent());
2837
2838 // Add to our 'need-to-extract' list.
2839 if (TreeEntry *E = getTreeEntry(VL[i])) {
2840 // Find which lane we need to extract.
2841 int FoundLane = -1;
2842 for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
2843 // Is this the lane of the scalar that we are looking for ?
2844 if (E->Scalars[Lane] == VL[i]) {
2845 FoundLane = Lane;
2846 break;
2847 }
2848 }
2849 assert(FoundLane >= 0 && "Could not find the correct lane");
2850 if (!E->ReuseShuffleIndices.empty()) {
2851 FoundLane =
2852 std::distance(E->ReuseShuffleIndices.begin(),
2853 llvm::find(E->ReuseShuffleIndices, FoundLane));
2854 }
2855 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2856 }
2857 }
2858 }
2859
2860 return Vec;
2861}
2862
2863Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2864 InstructionsState S = getSameOpcode(VL);
2865 if (S.Opcode) {
2866 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
2867 if (E->isSame(VL)) {
2868 Value *V = vectorizeTree(E);
2869 if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
2870 // We need to get the vectorized value but without shuffle.
2871 if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
2872 V = SV->getOperand(0);
2873 } else {
2874 // Reshuffle to get only unique values.
2875 SmallVector<unsigned, 4> UniqueIdxs;
2876 SmallSet<unsigned, 4> UsedIdxs;
2877 for(unsigned Idx : E->ReuseShuffleIndices)
2878 if (UsedIdxs.insert(Idx).second)
2879 UniqueIdxs.emplace_back(Idx);
2880 V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
2881 UniqueIdxs);
2882 }
2883 }
2884 return V;
2885 }
2886 }
2887 }
2888
2889 Type *ScalarTy = S.OpValue->getType();
2890 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
2891 ScalarTy = SI->getValueOperand()->getType();
2892
2893 // Check that every instruction appears once in this bundle.
2894 SmallVector<unsigned, 4> ReuseShuffleIndicies;
2895 SmallVector<Value *, 4> UniqueValues;
2896 if (VL.size() > 2) {
2897 DenseMap<Value *, unsigned> UniquePositions;
2898 for (Value *V : VL) {
2899 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
2900 ReuseShuffleIndicies.emplace_back(Res.first->second);
2901 if (Res.second || isa<Constant>(V))
2902 UniqueValues.emplace_back(V);
2903 }
2904 // Do not shuffle a single element or if the number of unique values is not
2905 // a power of 2.
2906 if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
2907 !llvm::isPowerOf2_32(UniqueValues.size()))
2908 ReuseShuffleIndicies.clear();
2909 else
2910 VL = UniqueValues;
2911 }
2912 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2913
2914 Value *V = Gather(VL, VecTy);
2915 if (!ReuseShuffleIndicies.empty()) {
2916 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
2917 ReuseShuffleIndicies, "shuffle");
2918 if (auto *I = dyn_cast<Instruction>(V)) {
2919 GatherSeq.insert(I);
2920 CSEBlocks.insert(I->getParent());
2921 }
2922 }
2923 return V;
2924}
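
For a gather bundle with repeated scalars, the code above records, for every lane, the position of the value's first occurrence (ReuseShuffleIndicies) and gathers only the unique values; the real code additionally keeps duplicate constants and drops the indices when the unique count equals VL.size(), is 1, or is not a power of two. A simplified standalone sketch of the index construction (names are illustrative only):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

void splitUniqueAndReuse(const std::vector<std::string> &VL,
                         std::vector<std::string> &Unique,
                         std::vector<unsigned> &ReuseIdx) {
  std::unordered_map<std::string, unsigned> Pos;
  for (const std::string &V : VL) {
    // Remember the lane of the first occurrence and reuse it for every
    // later duplicate, mirroring the try_emplace loop above.
    auto It = Pos.emplace(V, Unique.size());
    ReuseIdx.push_back(It.first->second);
    if (It.second)
      Unique.push_back(V);
  }
}

int main() {
  std::vector<std::string> Unique;
  std::vector<unsigned> ReuseIdx;
  splitUniqueAndReuse({"a", "b", "a", "b"}, Unique, ReuseIdx);
  // Unique = {a, b}; ReuseIdx = {0, 1, 0, 1}: gather the two unique values,
  // then one shuffle with this mask rebuilds the original four-lane bundle.
  for (unsigned Idx : ReuseIdx)
    std::cout << Idx << " ";
  std::cout << "\n"; // prints 0 1 0 1
}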
2925
2926Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2927 IRBuilder<>::InsertPointGuard Guard(Builder);
2928
2929 if (E->VectorizedValue) {
2930 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2931 return E->VectorizedValue;
2932 }
2933
2934 InstructionsState S = getSameOpcode(E->Scalars);
2935 Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2936 Type *ScalarTy = VL0->getType();
2937 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2938 ScalarTy = SI->getValueOperand()->getType();
2939 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2940
2941 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
2942
2943 if (E->NeedToGather) {
2944 setInsertPointAfterBundle(E->Scalars, VL0);
2945 auto *V = Gather(E->Scalars, VecTy);
2946 if (NeedToShuffleReuses) {
2947 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
2948 E->ReuseShuffleIndices, "shuffle");
2949 if (auto *I = dyn_cast<Instruction>(V)) {
2950 GatherSeq.insert(I);
2951 CSEBlocks.insert(I->getParent());
2952 }
2953 }
2954 E->VectorizedValue = V;
2955 return V;
2956 }
2957
2958 unsigned ShuffleOrOp = S.IsAltShuffle ?
2959 (unsigned) Instruction::ShuffleVector : S.Opcode;
2960 switch (ShuffleOrOp) {
2961 case Instruction::PHI: {
2962 PHINode *PH = dyn_cast<PHINode>(VL0);
2963 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2964 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2965 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2966 Value *V = NewPhi;
2967 if (NeedToShuffleReuses) {
2968 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
2969 E->ReuseShuffleIndices, "shuffle");
2970 }
2971 E->VectorizedValue = V;
2972
2973 // PHINodes may have multiple entries from the same block. We want to
2974 // visit every block once.
2975 SmallSet<BasicBlock*, 4> VisitedBBs;
2976
2977 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2978 ValueList Operands;
2979 BasicBlock *IBB = PH->getIncomingBlock(i);
2980
2981 if (!VisitedBBs.insert(IBB).second) {
2982 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2983 continue;
2984 }
2985
2986 // Prepare the operand vector.
2987 for (Value *V : E->Scalars)
2988 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2989
2990 Builder.SetInsertPoint(IBB->getTerminator());
2991 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2992 Value *Vec = vectorizeTree(Operands);
2993 NewPhi->addIncoming(Vec, IBB);
2994 }
2995
2996 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2997 "Invalid number of incoming values");
2998 return V;
2999 }
3000
3001 case Instruction::ExtractElement: {
3002 if (canReuseExtract(E->Scalars, VL0)) {
3003 Value *V = VL0->getOperand(0);
3004 if (NeedToShuffleReuses) {
3005 Builder.SetInsertPoint(VL0);
3006 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3007 E->ReuseShuffleIndices, "shuffle");
3008 }
3009 E->VectorizedValue = V;
3010 return V;
3011 }
3012 setInsertPointAfterBundle(E->Scalars, VL0);
3013 auto *V = Gather(E->Scalars, VecTy);
3014 if (NeedToShuffleReuses) {
3015 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3016 E->ReuseShuffleIndices, "shuffle");
3017 if (auto *I = dyn_cast<Instruction>(V)) {
3018 GatherSeq.insert(I);
3019 CSEBlocks.insert(I->getParent());
3020 }
3021 }
3022 E->VectorizedValue = V;
3023 return V;
3024 }
3025 case Instruction::ExtractValue: {
3026 if (canReuseExtract(E->Scalars, VL0)) {
3027 LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
3028 Builder.SetInsertPoint(LI);
3029 PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
3030 Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
3031 LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
3032 Value *NewV = propagateMetadata(V, E->Scalars);
3033 if (NeedToShuffleReuses) {
3034 NewV = Builder.CreateShuffleVector(
3035 NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
3036 }
3037 E->VectorizedValue = NewV;
3038 return NewV;
3039 }
3040 setInsertPointAfterBundle(E->Scalars, VL0);
3041 auto *V = Gather(E->Scalars, VecTy);
3042 if (NeedToShuffleReuses) {
3043 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3044 E->ReuseShuffleIndices, "shuffle");
3045 if (auto *I = dyn_cast<Instruction>(V)) {
3046 GatherSeq.insert(I);
3047 CSEBlocks.insert(I->getParent());
3048 }
3049 }
3050 E->VectorizedValue = V;
3051 return V;
3052 }
3053 case Instruction::ZExt:
3054 case Instruction::SExt:
3055 case Instruction::FPToUI:
3056 case Instruction::FPToSI:
3057 case Instruction::FPExt:
3058 case Instruction::PtrToInt:
3059 case Instruction::IntToPtr:
3060 case Instruction::SIToFP:
3061 case Instruction::UIToFP:
3062 case Instruction::Trunc:
3063 case Instruction::FPTrunc:
3064 case Instruction::BitCast: {
3065 ValueList INVL;
3066 for (Value *V : E->Scalars)
3067 INVL.push_back(cast<Instruction>(V)->getOperand(0));
3068
3069 setInsertPointAfterBundle(E->Scalars, VL0);
3070
3071 Value *InVec = vectorizeTree(INVL);
3072
3073 if (E->VectorizedValue) {
3074 DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
3075 return E->VectorizedValue;
3076 }
3077
3078 CastInst *CI = dyn_cast<CastInst>(VL0);
3079 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
3080 if (NeedToShuffleReuses) {
3081 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3082 E->ReuseShuffleIndices, "shuffle");
3083 }
3084 E->VectorizedValue = V;
3085 ++NumVectorInstructions;
3086 return V;
3087 }
3088 case Instruction::FCmp:
3089 case Instruction::ICmp: {
3090 ValueList LHSV, RHSV;
3091 for (Value *V : E->Scalars) {
3092 LHSV.push_back(cast<Instruction>(V)->getOperand(0));
3093 RHSV.push_back(cast<Instruction>(V)->getOperand(1));
3094 }
3095
3096 setInsertPointAfterBundle(E->Scalars, VL0);
3097
3098 Value *L = vectorizeTree(LHSV);
3099 Value *R = vectorizeTree(RHSV);
3100
3101 if (E->VectorizedValue) {
3102 DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
3103 return E->VectorizedValue;
3104 }
3105
3106 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
3107 Value *V;
3108 if (S.Opcode == Instruction::FCmp)
3109 V = Builder.CreateFCmp(P0, L, R);
3110 else
3111 V = Builder.CreateICmp(P0, L, R);
3112
3113 propagateIRFlags(V, E->Scalars, VL0);
3114 if (NeedToShuffleReuses) {
3115 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3116 E->ReuseShuffleIndices, "shuffle");
3117 }
3118 E->VectorizedValue = V;
3119 ++NumVectorInstructions;
3120 return V;
3121 }
3122 case Instruction::Select: {
3123 ValueList TrueVec, FalseVec, CondVec;
3124 for (Value *V : E->Scalars) {
3125 CondVec.push_back(cast<Instruction>(V)->getOperand(0));
3126 TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
3127 FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
3128 }
3129
3130 setInsertPointAfterBundle(E->Scalars, VL0);
3131
3132 Value *Cond = vectorizeTree(CondVec);
3133 Value *True = vectorizeTree(TrueVec);
3134 Value *False = vectorizeTree(FalseVec);
3135
3136 if (E->VectorizedValue) {
3137 DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
3138 return E->VectorizedValue;
3139 }
3140
3141 Value *V = Builder.CreateSelect(Cond, True, False);
3142 if (NeedToShuffleReuses) {
3143 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3144 E->ReuseShuffleIndices, "shuffle");
3145 }
3146 E->VectorizedValue = V;
3147 ++NumVectorInstructions;
3148 return V;
3149 }
3150 case Instruction::Add:
3151 case Instruction::FAdd:
3152 case Instruction::Sub:
3153 case Instruction::FSub:
3154 case Instruction::Mul:
3155 case Instruction::FMul:
3156 case Instruction::UDiv:
3157 case Instruction::SDiv:
3158 case Instruction::FDiv:
3159 case Instruction::URem:
3160 case Instruction::SRem:
3161 case Instruction::FRem:
3162 case Instruction::Shl:
3163 case Instruction::LShr:
3164 case Instruction::AShr:
3165 case Instruction::And:
3166 case Instruction::Or:
3167 case Instruction::Xor: {
3168 ValueList LHSVL, RHSVL;
3169 if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
3170 reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
3171 RHSVL);
3172 else
3173 for (Value *V : E->Scalars) {
3174 auto *I = cast<Instruction>(V);
3175 LHSVL.push_back(I->getOperand(0));
3176 RHSVL.push_back(I->getOperand(1));
3177 }
3178
3179 setInsertPointAfterBundle(E->Scalars, VL0);
3180
3181 Value *LHS = vectorizeTree(LHSVL);
3182 Value *RHS = vectorizeTree(RHSVL);
3183
3184 if (E->VectorizedValue) {
3185 DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
3186 return E->VectorizedValue;
3187 }
3188
3189 Value *V = Builder.CreateBinOp(
3190 static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
3191 propagateIRFlags(V, E->Scalars, VL0);
3192 if (auto *I = dyn_cast<Instruction>(V))
3193 V = propagateMetadata(I, E->Scalars);
3194
3195 if (NeedToShuffleReuses) {
3196 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3197 E->ReuseShuffleIndices, "shuffle");
3198 }
3199 E->VectorizedValue = V;
3200 ++NumVectorInstructions;
3201
3202 return V;
3203 }
3204 case Instruction::Load: {
3205 // Loads are inserted at the head of the tree because we don't want to
3206 // sink them all the way down past store instructions.
3207 bool IsReversed =
3208 !isConsecutiveAccess(E->Scalars[0], E->Scalars[1], *DL, *SE);
3209 if (IsReversed)
3210 VL0 = cast<Instruction>(E->Scalars.back());
3211 setInsertPointAfterBundle(E->Scalars, VL0);
3212
3213 LoadInst *LI = cast<LoadInst>(VL0);
3214 Type *ScalarLoadTy = LI->getType();
3215 unsigned AS = LI->getPointerAddressSpace();
3216
3217 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
3218 VecTy->getPointerTo(AS));
3219
3220 // The pointer operand uses an in-tree scalar so we add the new BitCast to
3221 // ExternalUses list to make sure that an extract will be generated in the
3222 // future.
3223 Value *PO = LI->getPointerOperand();
3224 if (getTreeEntry(PO))
3225 ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
3226
3227 unsigned Alignment = LI->getAlignment();
3228 LI = Builder.CreateLoad(VecPtr);
3229 if (!Alignment) {
3230 Alignment = DL->getABITypeAlignment(ScalarLoadTy);
3231 }
3232 LI->setAlignment(Alignment);
3233 Value *V = propagateMetadata(LI, E->Scalars);
3234 if (IsReversed) {
3235 SmallVector<uint32_t, 4> Mask(E->Scalars.size());
3236 std::iota(Mask.rbegin(), Mask.rend(), 0);
3237 V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
3238 }
3239 if (NeedToShuffleReuses) {
3240 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3241 E->ReuseShuffleIndices, "shuffle");
3242 }
3243 E->VectorizedValue = V;
3244 ++NumVectorInstructions;
3245 return V;
3246 }
3247 case Instruction::Store: {
3248 StoreInst *SI = cast<StoreInst>(VL0);
3249 unsigned Alignment = SI->getAlignment();
3250 unsigned AS = SI->getPointerAddressSpace();
3251
3252 ValueList ScalarStoreValues;
3253 for (Value *V : E->Scalars)
3254 ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
3255
3256 setInsertPointAfterBundle(E->Scalars, VL0);
3257
3258 Value *VecValue = vectorizeTree(ScalarStoreValues);
3259 Value *ScalarPtr = SI->getPointerOperand();
3260 Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
3261 StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
3262
3263 // The pointer operand uses an in-tree scalar, so add the new BitCast to
3264 // ExternalUses to make sure that an extract will be generated in the
3265 // future.
3266 if (getTreeEntry(ScalarPtr))
3267 ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
3268
3269 if (!Alignment)
3270 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
3271
3272 S->setAlignment(Alignment);
3273 Value *V = propagateMetadata(S, E->Scalars);
3274 if (NeedToShuffleReuses) {
3275 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3276 E->ReuseShuffleIndices, "shuffle");
3277 }
3278 E->VectorizedValue = V;
3279 ++NumVectorInstructions;
3280 return V;
3281 }
3282 case Instruction::GetElementPtr: {
3283 setInsertPointAfterBundle(E->Scalars, VL0);
3284
3285 ValueList Op0VL;
3286 for (Value *V : E->Scalars)
3287 Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
3288
3289 Value *Op0 = vectorizeTree(Op0VL);
3290
3291 std::vector<Value *> OpVecs;
3292 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
3293 ++j) {
3294 ValueList OpVL;
3295 for (Value *V : E->Scalars)
3296 OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
3297
3298 Value *OpVec = vectorizeTree(OpVL);
3299 OpVecs.push_back(OpVec);
3300 }
3301
3302 Value *V = Builder.CreateGEP(
3303 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
3304 if (Instruction *I = dyn_cast<Instruction>(V))
3305 V = propagateMetadata(I, E->Scalars);
3306
3307 if (NeedToShuffleReuses) {
3308 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3309 E->ReuseShuffleIndices, "shuffle");
3310 }
3311 E->VectorizedValue = V;
3312 ++NumVectorInstructions;
3313
3314 return V;
3315 }
3316 case Instruction::Call: {
3317 CallInst *CI = cast<CallInst>(VL0);
3318 setInsertPointAfterBundle(E->Scalars, VL0);
3319 Function *FI;
3320 Intrinsic::ID IID = Intrinsic::not_intrinsic;
3321 Value *ScalarArg = nullptr;
3322 if (CI && (FI = CI->getCalledFunction())) {
3323 IID = FI->getIntrinsicID();
3324 }
3325 std::vector<Value *> OpVecs;
3326 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
3327 ValueList OpVL;
3328 // ctlz,cttz and powi are special intrinsics whose second argument is
3329 // a scalar. This argument should not be vectorized.
3330 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
3331 CallInst *CEI = cast<CallInst>(VL0);
3332 ScalarArg = CEI->getArgOperand(j);
3333 OpVecs.push_back(CEI->getArgOperand(j));
3334 continue;
3335 }
3336 for (Value *V : E->Scalars) {
3337 CallInst *CEI = cast<CallInst>(V);
3338 OpVL.push_back(CEI->getArgOperand(j));
3339 }
3340
3341 Value *OpVec = vectorizeTree(OpVL);
3342 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
3343 OpVecs.push_back(OpVec);
3344 }
3345
3346 Module *M = F->getParent();
3347 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3348 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
3349 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
3350 SmallVector<OperandBundleDef, 1> OpBundles;
3351 CI->getOperandBundlesAsDefs(OpBundles);
3352 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
3353
3354 // The scalar argument uses an in-tree scalar so we add the new vectorized
3355 // call to ExternalUses list to make sure that an extract will be
3356 // generated in the future.
3357 if (ScalarArg && getTreeEntry(ScalarArg))
3358 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
3359
3360 propagateIRFlags(V, E->Scalars, VL0);
3361 if (NeedToShuffleReuses) {
3362 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3363 E->ReuseShuffleIndices, "shuffle");
3364 }
3365 E->VectorizedValue = V;
3366 ++NumVectorInstructions;
3367 return V;
3368 }
3369 case Instruction::ShuffleVector: {
3370 ValueList LHSVL, RHSVL;
3371 assert(Instruction::isBinaryOp(S.Opcode) &&
3372 "Invalid Shuffle Vector Operand");
3373 reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
3374 setInsertPointAfterBundle(E->Scalars, VL0);
3375
3376 Value *LHS = vectorizeTree(LHSVL);
3377 Value *RHS = vectorizeTree(RHSVL);
3378
3379 if (E->VectorizedValue) {
3380 DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
3381 return E->VectorizedValue;
3382 }
3383
3384 // Create a vector of LHS op1 RHS
3385 Value *V0 = Builder.CreateBinOp(
3386 static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
3387
3388 unsigned AltOpcode = getAltOpcode(S.Opcode);
3389 // Create a vector of LHS op2 RHS
3390 Value *V1 = Builder.CreateBinOp(
3391 static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
3392
3393 // Create shuffle to take alternate operations from the vector.
3394 // Also, gather up odd and even scalar ops to propagate IR flags to
3395 // each vector operation.
3396 ValueList OddScalars, EvenScalars;
3397 unsigned e = E->Scalars.size();
3398 SmallVector<Constant *, 8> Mask(e);
3399 for (unsigned i = 0; i < e; ++i) {
3400 if (isOdd(i)) {
3401 Mask[i] = Builder.getInt32(e + i);
3402 OddScalars.push_back(E->Scalars[i]);
3403 } else {
3404 Mask[i] = Builder.getInt32(i);
3405 EvenScalars.push_back(E->Scalars[i]);
3406 }
3407 }
3408
3409 Value *ShuffleMask = ConstantVector::get(Mask);
3410 propagateIRFlags(V0, EvenScalars);
3411 propagateIRFlags(V1, OddScalars);
3412
3413 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
3414 if (Instruction *I = dyn_cast<Instruction>(V))
3415 V = propagateMetadata(I, E->Scalars);
3416 if (NeedToShuffleReuses) {
3417 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
3418 E->ReuseShuffleIndices, "shuffle");
3419 }
3420 E->VectorizedValue = V;
3421 ++NumVectorInstructions;
3422
3423 return V;
3424 }
3425 default:
3426 llvm_unreachable("unknown inst");
3427 }
3428 return nullptr;
3429}
3430
3431Value *BoUpSLP::vectorizeTree() {
3432 ExtraValueToDebugLocsMap ExternallyUsedValues;
3433 return vectorizeTree(ExternallyUsedValues);
3434}
3435
3436Value *
3437BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
3438 // All blocks must be scheduled before any instructions are inserted.
3439 for (auto &BSIter : BlocksSchedules) {
3440 scheduleBlock(BSIter.second.get());
3441 }
3442
3443 Builder.SetInsertPoint(&F->getEntryBlock().front());
3444 auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
3445
3446 // If the vectorized tree can be rewritten in a smaller type, we truncate the
3447 // vectorized root. InstCombine will then rewrite the entire expression. We
3448 // sign extend the extracted values below.
3449 auto *ScalarRoot = VectorizableTree[0].Scalars[0];
3450 if (MinBWs.count(ScalarRoot)) {
3451 if (auto *I = dyn_cast<Instruction>(VectorRoot))
3452 Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
3453 auto BundleWidth = VectorizableTree[0].Scalars.size();
3454 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
3455 auto *VecTy = VectorType::get(MinTy, BundleWidth);
3456 auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
3457 VectorizableTree[0].VectorizedValue = Trunc;
3458 }
3459
3460 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
3461
3462 // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
3463 // specified by ScalarType.
3464 auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
3465 if (!MinBWs.count(ScalarRoot))
3466 return Ex;
3467 if (MinBWs[ScalarRoot].second)
3468 return Builder.CreateSExt(Ex, ScalarType);
3469 return Builder.CreateZExt(Ex, ScalarType);
3470 };
3471
3472 // Extract all of the elements with the external uses.
3473 for (const auto &ExternalUse : ExternalUses) {
3474 Value *Scalar = ExternalUse.Scalar;
3475 llvm::User *User = ExternalUse.User;
3476
3477 // Skip users that we already RAUW. This happens when one instruction
3478 // has multiple uses of the same value.
3479 if (User && !is_contained(Scalar->users(), User))
3480 continue;
3481 TreeEntry *E = getTreeEntry(Scalar);
3482 assert(E && "Invalid scalar");
3483 assert(!E->NeedToGather && "Extracting from a gather list");
3484
3485 Value *Vec = E->VectorizedValue;
3486 assert(Vec && "Can't find vectorizable value");
3487
3488 Value *Lane = Builder.getInt32(ExternalUse.Lane);
3489 // If User == nullptr, the Scalar is used as extra arg. Generate
3490 // ExtractElement instruction and update the record for this scalar in
3491 // ExternallyUsedValues.
3492 if (!User) {
3493 assert(ExternallyUsedValues.count(Scalar) &&
3494 "Scalar with nullptr as an external user must be registered in "
3495 "ExternallyUsedValues map");
3496 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
3497 Builder.SetInsertPoint(VecI->getParent(),
3498 std::next(VecI->getIterator()));
3499 } else {
3500 Builder.SetInsertPoint(&F->getEntryBlock().front());
3501 }
3502 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3503 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3504 CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
3505 auto &Locs = ExternallyUsedValues[Scalar];
3506 ExternallyUsedValues.insert({Ex, Locs});
3507 ExternallyUsedValues.erase(Scalar);
3508 continue;
3509 }
3510
3511 // Generate extracts for out-of-tree users.
3512 // Find the insertion point for the extractelement lane.
3513 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
3514 if (PHINode *PH = dyn_cast<PHINode>(User)) {
3515 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
3516 if (PH->getIncomingValue(i) == Scalar) {
3517 TerminatorInst *IncomingTerminator =
3518 PH->getIncomingBlock(i)->getTerminator();
3519 if (isa<CatchSwitchInst>(IncomingTerminator)) {
3520 Builder.SetInsertPoint(VecI->getParent(),
3521 std::next(VecI->getIterator()));
3522 } else {
3523 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
3524 }
3525 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3526 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3527 CSEBlocks.insert(PH->getIncomingBlock(i));
3528 PH->setOperand(i, Ex);
3529 }
3530 }
3531 } else {
3532 Builder.SetInsertPoint(cast<Instruction>(User));
3533 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3534 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3535 CSEBlocks.insert(cast<Instruction>(User)->getParent());
3536 User->replaceUsesOfWith(Scalar, Ex);
3537 }
3538 } else {
3539 Builder.SetInsertPoint(&F->getEntryBlock().front());
3540 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
3541 Ex = extend(ScalarRoot, Ex, Scalar->getType());
3542 CSEBlocks.insert(&F->getEntryBlock());
3543 User->replaceUsesOfWith(Scalar, Ex);
3544 }
3545
3546 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
3547 }
3548
3549 // For each vectorized value:
3550 for (TreeEntry &EIdx : VectorizableTree) {
3551 TreeEntry *Entry = &EIdx;
3552
3553 // No need to handle users of gathered values.
3554 if (Entry->NeedToGather)
3555 continue;
3556
3557 assert(Entry->VectorizedValue && "Can't find vectorizable value");
3558
3559 // For each lane:
3560 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
3561 Value *Scalar = Entry->Scalars[Lane];
3562
3563 Type *Ty = Scalar->getType();
3564 if (!Ty->isVoidTy()) {
3565#ifndef NDEBUG
3566 for (User *U : Scalar->users()) {
3567 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
3568
3569 // It is legal to replace users in the ignorelist by undef.
3570 assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
3571 "Replacing out-of-tree value with undef");
3572 }
3573#endif
3574 Value *Undef = UndefValue::get(Ty);
3575 Scalar->replaceAllUsesWith(Undef);
3576 }
3577 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
3578 eraseInstruction(cast<Instruction>(Scalar));
3579 }
3580 }
3581
3582 Builder.ClearInsertionPoint();
3583
3584 return VectorizableTree[0].VectorizedValue;
3585}
3586
3587void BoUpSLP::optimizeGatherSequence() {
3588 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
3589 << " gather sequences instructions.\n");
3590 // LICM InsertElementInst sequences.
3591 for (Instruction *I : GatherSeq) {
3592 if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
3593 continue;
3594
3595 // Check if this block is inside a loop.
3596 Loop *L = LI->getLoopFor(I->getParent());
3597 if (!L)
3598 continue;
3599
3600 // Check if it has a preheader.
3601 BasicBlock *PreHeader = L->getLoopPreheader();
3602 if (!PreHeader)
3603 continue;
3604
3605 // If the vector or the element that we insert into it are
3606 // instructions that are defined in this basic block then we can't
3607 // hoist this instruction.
3608 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
3609 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
3610 if (Op0 && L->contains(Op0))
3611 continue;
3612 if (Op1 && L->contains(Op1))
3613 continue;
3614
3615 // We can hoist this instruction. Move it to the pre-header.
3616 I->moveBefore(PreHeader->getTerminator());
3617 }
3618
3619 // Make a list of all reachable blocks in our CSE queue.
3620 SmallVector<const DomTreeNode *, 8> CSEWorkList;
3621 CSEWorkList.reserve(CSEBlocks.size());
3622 for (BasicBlock *BB : CSEBlocks)
3623 if (DomTreeNode *N = DT->getNode(BB)) {
3624 assert(DT->isReachableFromEntry(N));
3625 CSEWorkList.push_back(N);
3626 }
3627
3628 // Sort blocks by domination. This ensures we visit a block after all blocks
3629 // dominating it are visited.
3630 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
3631 [this](const DomTreeNode *A, const DomTreeNode *B) {
3632 return DT->properlyDominates(A, B);
3633 });
3634
3635 // Perform O(N^2) search over the gather sequences and merge identical
3636 // instructions. TODO: We can further optimize this scan if we split the
3637 // instructions into different buckets based on the insert lane.
3638 SmallVector<Instruction *, 16> Visited;
3639 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21
Assuming 'I' is not equal to 'E'
22
Loop condition is true. Entering loop body
24
Loop condition is true. Entering loop body
26
Loop condition is true. Entering loop body
28
Loop condition is true. Entering loop body
3640 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
29
Within the expansion of the macro 'assert':
a
Calling 'DominatorTreeBase::dominates'
b
Returning from 'DominatorTreeBase::dominates'
3641 "Worklist not sorted properly!");
3642 BasicBlock *BB = (*I)->getBlock();
37
Called C++ object pointer is null
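The path ends at this dereference: the analyzer treats *I as possibly null, apparently because DominatorTreeBase::dominates(), called inside the assert above, tolerates null node arguments, so the checker keeps a null state alive for *I. The snippet below is an illustrative sketch only, not the upstream fix; the helper name visitCSEBlocks and its signature are invented for the example, and it merely shows how the worklist entry could be guarded before getBlock() is called.

// Illustrative sketch only -- not the upstream fix. Guards the DomTreeNode
// before the getBlock() call that the analyzer flags; names are hypothetical.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"

static void visitCSEBlocks(
    const llvm::SmallVectorImpl<const llvm::DomTreeNode *> &CSEWorkList) {
  for (const llvm::DomTreeNode *N : CSEWorkList) {
    if (!N)
      continue; // skip a null node instead of dereferencing it
    llvm::BasicBlock *BB = N->getBlock();
    (void)BB; // ...visit the instructions in BB as optimizeGatherSequence() does
  }
}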
3643 // For all instructions in blocks containing gather sequences:
3644 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
23
Loop condition is false. Execution continues on line 3639
25
Loop condition is false. Execution continues on line 3639
27
Loop condition is false. Execution continues on line 3639
3645 Instruction *In = &*it++;
3646 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
3647 continue;
3648
3649 // Check if we can replace this instruction with any of the
3650 // visited instructions.
3651 for (Instruction *v : Visited) {
3652 if (In->isIdenticalTo(v) &&
3653 DT->dominates(v->getParent(), In->getParent())) {
3654 In->replaceAllUsesWith(v);
3655 eraseInstruction(In);
3656 In = nullptr;
3657 break;
3658 }
3659 }
3660 if (In) {
3661 assert(!is_contained(Visited, In));
3662 Visited.push_back(In);
3663 }
3664 }
3665 }
3666 CSEBlocks.clear();
3667 GatherSeq.clear();
3668}
3669
3670// Groups the instructions to a bundle (which is then a single scheduling entity)
3671// and schedules instructions until the bundle gets ready.
3672bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
3673 BoUpSLP *SLP, Value *OpValue) {
3674 if (isa<PHINode>(OpValue))
3675 return true;
3676
3677 // Initialize the instruction bundle.
3678 Instruction *OldScheduleEnd = ScheduleEnd;
3679 ScheduleData *PrevInBundle = nullptr;
3680 ScheduleData *Bundle = nullptr;
3681 bool ReSchedule = false;
3682 DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: bundle: " << *OpValue
<< "\n"; } } while (false)
;
3683
3684 // Make sure that the scheduling region contains all
3685 // instructions of the bundle.
3686 for (Value *V : VL) {
3687 if (!extendSchedulingRegion(V, OpValue))
3688 return false;
3689 }
3690
3691 for (Value *V : VL) {
3692 ScheduleData *BundleMember = getScheduleData(V);
3693 assert(BundleMember &&
3694 "no ScheduleData for bundle member (maybe not in same basic block)");
3695 if (BundleMember->IsScheduled) {
3696 // A bundle member was scheduled as single instruction before and now
3697 // needs to be scheduled as part of the bundle. We just get rid of the
3698 // existing schedule.
3699 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMemberdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: reset schedule because " <<
*BundleMember << " was already scheduled\n"; } } while
(false)
3700 << " was already scheduled\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: reset schedule because " <<
*BundleMember << " was already scheduled\n"; } } while
(false)
;
3701 ReSchedule = true;
3702 }
3703 assert(BundleMember->isSchedulingEntity() &&
3704 "bundle member already part of other bundle");
3705 if (PrevInBundle) {
3706 PrevInBundle->NextInBundle = BundleMember;
3707 } else {
3708 Bundle = BundleMember;
3709 }
3710 BundleMember->UnscheduledDepsInBundle = 0;
3711 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
3712
3713 // Group the instructions to a bundle.
3714 BundleMember->FirstInBundle = Bundle;
3715 PrevInBundle = BundleMember;
3716 }
3717 if (ScheduleEnd != OldScheduleEnd) {
3718 // The scheduling region got new instructions at the lower end (or it is a
3719 // new region for the first bundle). This makes it necessary to
3720 // recalculate all dependencies.
3721 // It is seldom that this needs to be done a second time after adding the
3722 // initial bundle to the region.
3723 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3724 doForAllOpcodes(I, [](ScheduleData *SD) {
3725 SD->clearDependencies();
3726 });
3727 }
3728 ReSchedule = true;
3729 }
3730 if (ReSchedule) {
3731 resetSchedule();
3732 initialFillReadyList(ReadyInsts);
3733 }
3734
3735 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: try schedule bundle " <<
*Bundle << " in block " << BB->getName() <<
"\n"; } } while (false)
3736 << BB->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: try schedule bundle " <<
*Bundle << " in block " << BB->getName() <<
"\n"; } } while (false)
;
3737
3738 calculateDependencies(Bundle, true, SLP);
3739
3740 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
3741 // means that there are no cyclic dependencies and we can schedule it.
3742 // Note that it's important that we don't "schedule" the bundle yet (see
3743 // cancelScheduling).
3744 while (!Bundle->isReady() && !ReadyInsts.empty()) {
3745
3746 ScheduleData *pickedSD = ReadyInsts.back();
3747 ReadyInsts.pop_back();
3748
3749 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
3750 schedule(pickedSD, ReadyInsts);
3751 }
3752 }
3753 if (!Bundle->isReady()) {
3754 cancelScheduling(VL, OpValue);
3755 return false;
3756 }
3757 return true;
3758}
3759
3760void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
3761 Value *OpValue) {
3762 if (isa<PHINode>(OpValue))
3763 return;
3764
3765 ScheduleData *Bundle = getScheduleData(OpValue);
3766 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: cancel scheduling of " <<
*Bundle << "\n"; } } while (false)
;
3767 assert(!Bundle->IsScheduled &&
3768 "Can't cancel bundle which is already scheduled");
3769 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
3770 "tried to unbundle something which is not a bundle");
3771
3772 // Un-bundle: make single instructions out of the bundle.
3773 ScheduleData *BundleMember = Bundle;
3774 while (BundleMember) {
3775 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
3776 BundleMember->FirstInBundle = BundleMember;
3777 ScheduleData *Next = BundleMember->NextInBundle;
3778 BundleMember->NextInBundle = nullptr;
3779 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
3780 if (BundleMember->UnscheduledDepsInBundle == 0) {
3781 ReadyInsts.insert(BundleMember);
3782 }
3783 BundleMember = Next;
3784 }
3785}
3786
3787BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
3788 // Allocate a new ScheduleData for the instruction.
3789 if (ChunkPos >= ChunkSize) {
3790 ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
3791 ChunkPos = 0;
3792 }
3793 return &(ScheduleDataChunks.back()[ChunkPos++]);
3794}
3795
3796bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
3797 Value *OpValue) {
3798 if (getScheduleData(V, isOneOf(OpValue, V)))
3799 return true;
3800 Instruction *I = dyn_cast<Instruction>(V);
3801 assert(I && "bundle member must be an instruction")(static_cast <bool> (I && "bundle member must be an instruction"
) ? void (0) : __assert_fail ("I && \"bundle member must be an instruction\""
, "/build/llvm-toolchain-snapshot-7~svn325874/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 3801, __extension__ __PRETTY_FUNCTION__))
;
3802 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
3803 auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
3804 ScheduleData *ISD = getScheduleData(I);
3805 if (!ISD)
3806 return false;
3807 assert(isInSchedulingRegion(ISD) &&
3808 "ScheduleData not in scheduling region");
3809 ScheduleData *SD = allocateScheduleDataChunks();
3810 SD->Inst = I;
3811 SD->init(SchedulingRegionID, OpValue);
3812 ExtraScheduleDataMap[I][OpValue] = SD;
3813 return true;
3814 };
3815 if (CheckSheduleForI(I))
3816 return true;
3817 if (!ScheduleStart) {
3818 // It's the first instruction in the new region.
3819 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
3820 ScheduleStart = I;
3821 ScheduleEnd = I->getNextNode();
3822 if (isOneOf(OpValue, I) != I)
3823 CheckSheduleForI(I);
3824 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3825 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: initialize schedule region to "
<< *I << "\n"; } } while (false)
;
3826 return true;
3827 }
3828 // Search up and down at the same time, because we don't know if the new
3829 // instruction is above or below the existing scheduling region.
3830 BasicBlock::reverse_iterator UpIter =
3831 ++ScheduleStart->getIterator().getReverse();
3832 BasicBlock::reverse_iterator UpperEnd = BB->rend();
3833 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
3834 BasicBlock::iterator LowerEnd = BB->end();
3835 while (true) {
3836 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
3837 DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: exceeded schedule region size limit\n"
; } } while (false)
;
3838 return false;
3839 }
3840
3841 if (UpIter != UpperEnd) {
3842 if (&*UpIter == I) {
3843 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
3844 ScheduleStart = I;
3845 if (isOneOf(OpValue, I) != I)
3846 CheckSheduleForI(I);
3847 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: extend schedule region start to "
<< *I << "\n"; } } while (false)
;
3848 return true;
3849 }
3850 UpIter++;
3851 }
3852 if (DownIter != LowerEnd) {
3853 if (&*DownIter == I) {
3854 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
3855 nullptr);
3856 ScheduleEnd = I->getNextNode();
3857 if (isOneOf(OpValue, I) != I)
3858 CheckSheduleForI(I);
3859 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3860 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: extend schedule region end to "
<< *I << "\n"; } } while (false)
;
3861 return true;
3862 }
3863 DownIter++;
3864 }
3865 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
3866 "instruction not found in block");
3867 }
3868 return true;
3869}
3870
3871void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
3872 Instruction *ToI,
3873 ScheduleData *PrevLoadStore,
3874 ScheduleData *NextLoadStore) {
3875 ScheduleData *CurrentLoadStore = PrevLoadStore;
3876 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
3877 ScheduleData *SD = ScheduleDataMap[I];
3878 if (!SD) {
3879 SD = allocateScheduleDataChunks();
3880 ScheduleDataMap[I] = SD;
3881 SD->Inst = I;
3882 }
3883 assert(!isInSchedulingRegion(SD) &&
3884 "new ScheduleData already in scheduling region");
3885 SD->init(SchedulingRegionID, I);
3886
3887 if (I->mayReadOrWriteMemory() &&
3888 (!isa<IntrinsicInst>(I) ||
3889 cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
3890 // Update the linked list of memory accessing instructions.
3891 if (CurrentLoadStore) {
3892 CurrentLoadStore->NextLoadStore = SD;
3893 } else {
3894 FirstLoadStoreInRegion = SD;
3895 }
3896 CurrentLoadStore = SD;
3897 }
3898 }
3899 if (NextLoadStore) {
3900 if (CurrentLoadStore)
3901 CurrentLoadStore->NextLoadStore = NextLoadStore;
3902 } else {
3903 LastLoadStoreInRegion = CurrentLoadStore;
3904 }
3905}
3906
3907void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
3908 bool InsertInReadyList,
3909 BoUpSLP *SLP) {
3910 assert(SD->isSchedulingEntity());
3911
3912 SmallVector<ScheduleData *, 10> WorkList;
3913 WorkList.push_back(SD);
3914
3915 while (!WorkList.empty()) {
3916 ScheduleData *SD = WorkList.back();
3917 WorkList.pop_back();
3918
3919 ScheduleData *BundleMember = SD;
3920 while (BundleMember) {
3921 assert(isInSchedulingRegion(BundleMember));
3922 if (!BundleMember->hasValidDependencies()) {
3923
3924 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: update deps of " <<
*BundleMember << "\n"; } } while (false)
;
3925 BundleMember->Dependencies = 0;
3926 BundleMember->resetUnscheduledDeps();
3927
3928 // Handle def-use chain dependencies.
3929 if (BundleMember->OpValue != BundleMember->Inst) {
3930 ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
3931 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
3932 BundleMember->Dependencies++;
3933 ScheduleData *DestBundle = UseSD->FirstInBundle;
3934 if (!DestBundle->IsScheduled)
3935 BundleMember->incrementUnscheduledDeps(1);
3936 if (!DestBundle->hasValidDependencies())
3937 WorkList.push_back(DestBundle);
3938 }
3939 } else {
3940 for (User *U : BundleMember->Inst->users()) {
3941 if (isa<Instruction>(U)) {
3942 ScheduleData *UseSD = getScheduleData(U);
3943 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
3944 BundleMember->Dependencies++;
3945 ScheduleData *DestBundle = UseSD->FirstInBundle;
3946 if (!DestBundle->IsScheduled)
3947 BundleMember->incrementUnscheduledDeps(1);
3948 if (!DestBundle->hasValidDependencies())
3949 WorkList.push_back(DestBundle);
3950 }
3951 } else {
3952 // I'm not sure if this can ever happen. But we need to be safe.
3953 // This ensures the instruction/bundle is never scheduled, which
3954 // eventually disables vectorization.
3955 BundleMember->Dependencies++;
3956 BundleMember->incrementUnscheduledDeps(1);
3957 }
3958 }
3959 }
3960
3961 // Handle the memory dependencies.
3962 ScheduleData *DepDest = BundleMember->NextLoadStore;
3963 if (DepDest) {
3964 Instruction *SrcInst = BundleMember->Inst;
3965 MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
3966 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
3967 unsigned numAliased = 0;
3968 unsigned DistToSrc = 1;
3969
3970 while (DepDest) {
3971 assert(isInSchedulingRegion(DepDest));
3972
3973 // We have two limits to reduce the complexity:
3974 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
3975 // SLP->isAliased (which is the expensive part in this loop).
3976 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
3977 // the whole loop (even if the loop is fast, it's quadratic).
3978 // It's important for the loop break condition (see below) to
3979 // check this limit even between two read-only instructions.
3980 if (DistToSrc >= MaxMemDepDistance ||
3981 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
3982 (numAliased >= AliasedCheckLimit ||
3983 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
3984
3985 // We increment the counter only if the locations are aliased
3986 // (instead of counting all alias checks). This gives a better
3987 // balance between reduced runtime and accurate dependencies.
3988 numAliased++;
3989
3990 DepDest->MemoryDependencies.push_back(BundleMember);
3991 BundleMember->Dependencies++;
3992 ScheduleData *DestBundle = DepDest->FirstInBundle;
3993 if (!DestBundle->IsScheduled) {
3994 BundleMember->incrementUnscheduledDeps(1);
3995 }
3996 if (!DestBundle->hasValidDependencies()) {
3997 WorkList.push_back(DestBundle);
3998 }
3999 }
4000 DepDest = DepDest->NextLoadStore;
4001
4002 // Example, explaining the loop break condition: Let's assume our
4003 // starting instruction is i0 and MaxMemDepDistance = 3.
4004 //
4005 // +--------v--v--v
4006 // i0,i1,i2,i3,i4,i5,i6,i7,i8
4007 // +--------^--^--^
4008 //
4009 // MaxMemDepDistance let us stop alias-checking at i3 and we add
4010 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
4011 // Previously we already added dependencies from i3 to i6,i7,i8
4012 // (because of MaxMemDepDistance). As we added a dependency from
4013 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
4014 // and we can abort this loop at i6.
4015 if (DistToSrc >= 2 * MaxMemDepDistance)
4016 break;
4017 DistToSrc++;
4018 }
4019 }
4020 }
4021 BundleMember = BundleMember->NextInBundle;
4022 }
4023 if (InsertInReadyList && SD->isReady()) {
4024 ReadyInsts.push_back(SD);
4025 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready on update: " <<
*SD->Inst << "\n"; } } while (false)
;
4026 }
4027 }
4028}
4029
4030void BoUpSLP::BlockScheduling::resetSchedule() {
4031 assert(ScheduleStart &&
4032 "tried to reset schedule on block which has not been scheduled");
4033 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4034 doForAllOpcodes(I, [&](ScheduleData *SD) {
4035 assert(isInSchedulingRegion(SD) &&
4036 "ScheduleData not in scheduling region");
4037 SD->IsScheduled = false;
4038 SD->resetUnscheduledDeps();
4039 });
4040 }
4041 ReadyInsts.clear();
4042}
4043
4044void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
4045 if (!BS->ScheduleStart)
4046 return;
4047
4048 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: schedule block " << BS
->BB->getName() << "\n"; } } while (false)
;
4049
4050 BS->resetSchedule();
4051
4052 // For the real scheduling we use a more sophisticated ready-list: it is
4053 // sorted by the original instruction location. This lets the final schedule
4054 // be as close as possible to the original instruction order.
4055 struct ScheduleDataCompare {
4056 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
4057 return SD2->SchedulingPriority < SD1->SchedulingPriority;
4058 }
4059 };
4060 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
4061
4062 // Ensure that all dependency data is updated and fill the ready-list with
4063 // initial instructions.
4064 int Idx = 0;
4065 int NumToSchedule = 0;
4066 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
4067 I = I->getNextNode()) {
4068 BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
4069 assert(SD->isPartOfBundle() ==
4070 (getTreeEntry(SD->Inst) != nullptr) &&
4071 "scheduler and vectorizer bundle mismatch");
4072 SD->FirstInBundle->SchedulingPriority = Idx++;
4073 if (SD->isSchedulingEntity()) {
4074 BS->calculateDependencies(SD, false, this);
4075 NumToSchedule++;
4076 }
4077 });
4078 }
4079 BS->initialFillReadyList(ReadyInsts);
4080
4081 Instruction *LastScheduledInst = BS->ScheduleEnd;
4082
4083 // Do the "real" scheduling.
4084 while (!ReadyInsts.empty()) {
4085 ScheduleData *picked = *ReadyInsts.begin();
4086 ReadyInsts.erase(ReadyInsts.begin());
4087
4088 // Move the scheduled instruction(s) to their dedicated places, if not
4089 // there yet.
4090 ScheduleData *BundleMember = picked;
4091 while (BundleMember) {
4092 Instruction *pickedInst = BundleMember->Inst;
4093 if (LastScheduledInst->getNextNode() != pickedInst) {
4094 BS->BB->getInstList().remove(pickedInst);
4095 BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
4096 pickedInst);
4097 }
4098 LastScheduledInst = pickedInst;
4099 BundleMember = BundleMember->NextInBundle;
4100 }
4101
4102 BS->schedule(picked, ReadyInsts);
4103 NumToSchedule--;
4104 }
4105 assert(NumToSchedule == 0 && "could not schedule all instructions");
4106
4107 // Avoid duplicate scheduling of the block.
4108 BS->ScheduleStart = nullptr;
4109}
4110
4111unsigned BoUpSLP::getVectorElementSize(Value *V) {
4112 // If V is a store, just return the width of the stored value without
4113 // traversing the expression tree. This is the common case.
4114 if (auto *Store = dyn_cast<StoreInst>(V))
4115 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
4116
4117 // If V is not a store, we can traverse the expression tree to find loads
4118 // that feed it. The type of the loaded value may indicate a more suitable
4119 // width than V's type. We want to base the vector element size on the width
4120 // of memory operations where possible.
4121 SmallVector<Instruction *, 16> Worklist;
4122 SmallPtrSet<Instruction *, 16> Visited;
4123 if (auto *I = dyn_cast<Instruction>(V))
4124 Worklist.push_back(I);
4125
4126 // Traverse the expression tree in bottom-up order looking for loads. If we
4127 // encounter an instruction we don't yet handle, we give up.
4128 auto MaxWidth = 0u;
4129 auto FoundUnknownInst = false;
4130 while (!Worklist.empty() && !FoundUnknownInst) {
4131 auto *I = Worklist.pop_back_val();
4132 Visited.insert(I);
4133
4134 // We should only be looking at scalar instructions here. If the current
4135 // instruction has a vector type, give up.
4136 auto *Ty = I->getType();
4137 if (isa<VectorType>(Ty))
4138 FoundUnknownInst = true;
4139
4140 // If the current instruction is a load, update MaxWidth to reflect the
4141 // width of the loaded value.
4142 else if (isa<LoadInst>(I))
4143 MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
4144
4145 // Otherwise, we need to visit the operands of the instruction. We only
4146 // handle the interesting cases from buildTree here. If an operand is an
4147 // instruction we haven't yet visited, we add it to the worklist.
4148 else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
4149 isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
4150 for (Use &U : I->operands())
4151 if (auto *J = dyn_cast<Instruction>(U.get()))
4152 if (!Visited.count(J))
4153 Worklist.push_back(J);
4154 }
4155
4156 // If we don't yet handle the instruction, give up.
4157 else
4158 FoundUnknownInst = true;
4159 }
4160
4161 // If we didn't encounter a memory access in the expression tree, or if we
4162 // gave up for some reason, just return the width of V.
4163 if (!MaxWidth || FoundUnknownInst)
4164 return DL->getTypeSizeInBits(V->getType());
4165
4166 // Otherwise, return the maximum width we found.
4167 return MaxWidth;
4168}
4169
4170// Determine if a value V in a vectorizable expression Expr can be demoted to a
4171// smaller type with a truncation. We collect the values that will be demoted
4172// in ToDemote and additional roots that require investigating in Roots.
4173static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
4174 SmallVectorImpl<Value *> &ToDemote,
4175 SmallVectorImpl<Value *> &Roots) {
4176 // We can always demote constants.
4177 if (isa<Constant>(V)) {
4178 ToDemote.push_back(V);
4179 return true;
4180 }
4181
4182 // If the value is not an instruction in the expression with only one use, it
4183 // cannot be demoted.
4184 auto *I = dyn_cast<Instruction>(V);
4185 if (!I || !I->hasOneUse() || !Expr.count(I))
4186 return false;
4187
4188 switch (I->getOpcode()) {
4189
4190 // We can always demote truncations and extensions. Since truncations can
4191 // seed additional demotion, we save the truncated value.
4192 case Instruction::Trunc:
4193 Roots.push_back(I->getOperand(0));
4194 break;
4195 case Instruction::ZExt:
4196 case Instruction::SExt:
4197 break;
4198
4199 // We can demote certain binary operations if we can demote both of their
4200 // operands.
4201 case Instruction::Add:
4202 case Instruction::Sub:
4203 case Instruction::Mul:
4204 case Instruction::And:
4205 case Instruction::Or:
4206 case Instruction::Xor:
4207 if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
4208 !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
4209 return false;
4210 break;
4211
4212 // We can demote selects if we can demote their true and false values.
4213 case Instruction::Select: {
4214 SelectInst *SI = cast<SelectInst>(I);
4215 if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
4216 !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
4217 return false;
4218 break;
4219 }
4220
4221 // We can demote phis if we can demote all their incoming operands. Note that
4222 // we don't need to worry about cycles since we ensure single use above.
4223 case Instruction::PHI: {
4224 PHINode *PN = cast<PHINode>(I);
4225 for (Value *IncValue : PN->incoming_values())
4226 if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
4227 return false;
4228 break;
4229 }
4230
4231 // Otherwise, conservatively give up.
4232 default:
4233 return false;
4234 }
4235
4236 // Record the value that we can demote.
4237 ToDemote.push_back(V);
4238 return true;
4239}
4240
4241void BoUpSLP::computeMinimumValueSizes() {
4242 // If there are no external uses, the expression tree must be rooted by a
4243 // store. We can't demote in-memory values, so there is nothing to do here.
4244 if (ExternalUses.empty())
4245 return;
4246
4247 // We only attempt to truncate integer expressions.
4248 auto &TreeRoot = VectorizableTree[0].Scalars;
4249 auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
4250 if (!TreeRootIT)
4251 return;
4252
4253 // If the expression is not rooted by a store, these roots should have
4254 // external uses. We will rely on InstCombine to rewrite the expression in
4255 // the narrower type. However, InstCombine only rewrites single-use values.
4256 // This means that if a tree entry other than a root is used externally, it
4257 // must have multiple uses and InstCombine will not rewrite it. The code
4258 // below ensures that only the roots are used externally.
4259 SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
4260 for (auto &EU : ExternalUses)
4261 if (!Expr.erase(EU.Scalar))
4262 return;
4263 if (!Expr.empty())
4264 return;
4265
4266 // Collect the scalar values of the vectorizable expression. We will use this
4267 // context to determine which values can be demoted. If we see a truncation,
4268 // we mark it as seeding another demotion.
4269 for (auto &Entry : VectorizableTree)
4270 Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
4271
4272 // Ensure the roots of the vectorizable tree don't form a cycle. They must
4273 // have a single external user that is not in the vectorizable tree.
4274 for (auto *Root : TreeRoot)
4275 if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
4276 return;
4277
4278 // Conservatively determine if we can actually truncate the roots of the
4279 // expression. Collect the values that can be demoted in ToDemote and
4280 // additional roots that require investigating in Roots.
4281 SmallVector<Value *, 32> ToDemote;
4282 SmallVector<Value *, 4> Roots;
4283 for (auto *Root : TreeRoot) {
4284 // Do not include top zext/sext/trunc operations to those to be demoted, it
4285 // produces noise cast<vect>, trunc <vect>, extract <vect>, cast <extract>
4286 // sequence.
4287 if (isa<Constant>(Root))
4288 continue;
4289 auto *I = dyn_cast<Instruction>(Root);
4290 if (!I || !I->hasOneUse() || !Expr.count(I))
4291 return;
4292 if (isa<ZExtInst>(I) || isa<SExtInst>(I))
4293 continue;
4294 if (auto *TI = dyn_cast<TruncInst>(I)) {
4295 Roots.push_back(TI->getOperand(0));
4296 continue;
4297 }
4298 if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
4299 return;
4300 }
4301
4302 // The maximum bit width required to represent all the values that can be
4303 // demoted without loss of precision. It would be safe to truncate the roots
4304 // of the expression to this width.
4305 auto MaxBitWidth = 8u;
4306
4307 // We first check if all the bits of the roots are demanded. If they're not,
4308 // we can truncate the roots to this narrower type.
4309 for (auto *Root : TreeRoot) {
4310 auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
4311 MaxBitWidth = std::max<unsigned>(
4312 Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
4313 }
4314
4315 // True if the roots can be zero-extended back to their original type, rather
4316 // than sign-extended. We know that if the leading bits are not demanded, we
4317 // can safely zero-extend. So we initialize IsKnownPositive to True.
4318 bool IsKnownPositive = true;
4319
4320 // If all the bits of the roots are demanded, we can try a little harder to
4321 // compute a narrower type. This can happen, for example, if the roots are
4322 // getelementptr indices. InstCombine promotes these indices to the pointer
4323 // width. Thus, all their bits are technically demanded even though the
4324 // address computation might be vectorized in a smaller type.
4325 //
4326 // We start by looking at each entry that can be demoted. We compute the
4327 // maximum bit width required to store the scalar by using ValueTracking to
4328 // compute the number of high-order bits we can truncate.
4329 if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
4330 MaxBitWidth = 8u;
4331
4332 // Determine if the sign bit of all the roots is known to be zero. If not,
4333 // IsKnownPositive is set to False.
4334 IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
4335 KnownBits Known = computeKnownBits(R, *DL);
4336 return Known.isNonNegative();
4337 });
4338
4339 // Determine the maximum number of bits required to store the scalar
4340 // values.
4341 for (auto *Scalar : ToDemote) {
4342 auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
4343 auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
4344 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
4345 }
4346
4347 // If we can't prove that the sign bit is zero, we must add one to the
4348 // maximum bit width to account for the unknown sign bit. This preserves
4349 // the existing sign bit so we can safely sign-extend the root back to the
4350 // original type. Otherwise, if we know the sign bit is zero, we will
4351 // zero-extend the root instead.
4352 //
4353 // FIXME: This is somewhat suboptimal, as there will be cases where adding
4354 // one to the maximum bit width will yield a larger-than-necessary
4355 // type. In general, we need to add an extra bit only if we can't
4356 // prove that the upper bit of the original type is equal to the
4357 // upper bit of the proposed smaller type. If these two bits are the
4358 // same (either zero or one) we know that sign-extending from the
4359 // smaller type will result in the same value. Here, since we can't
4360 // yet prove this, we are just making the proposed smaller type
4361 // larger to ensure correctness.
4362 if (!IsKnownPositive)
4363 ++MaxBitWidth;
4364 }
4365
4366 // Round MaxBitWidth up to the next power-of-two.
4367 if (!isPowerOf2_64(MaxBitWidth))
4368 MaxBitWidth = NextPowerOf2(MaxBitWidth);
4369
4370 // If the maximum bit width we compute is less than the width of the roots'
4371 // type, we can proceed with the narrowing. Otherwise, do nothing.
4372 if (MaxBitWidth >= TreeRootIT->getBitWidth())
4373 return;
4374
4375 // If we can truncate the root, we must collect additional values that might
4376 // be demoted as a result. That is, those seeded by truncations we will
4377 // modify.
4378 while (!Roots.empty())
4379 collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
4380
4381 // Finally, map the values we can demote to the maximum bit width we computed.
4382 for (auto *Scalar : ToDemote)
4383 MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
4384}
4385
4386namespace {
4387
4388/// The SLPVectorizer Pass.
4389struct SLPVectorizer : public FunctionPass {
4390 SLPVectorizerPass Impl;
4391
4392 /// Pass identification, replacement for typeid
4393 static char ID;
4394
4395 explicit SLPVectorizer() : FunctionPass(ID) {
4396 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
4397 }
4398
4399 bool doInitialization(Module &M) override {
4400 return false;
4401 }
4402
4403 bool runOnFunction(Function &F) override {
4404 if (skipFunction(F))
4405 return false;
4406
4407 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
4408 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
4409 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
4410 auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
4411 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
4412 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
4413 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
4414 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
4415 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
4416 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
4417
4418 return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
4419 }
4420
4421 void getAnalysisUsage(AnalysisUsage &AU) const override {
4422 FunctionPass::getAnalysisUsage(AU);
4423 AU.addRequired<AssumptionCacheTracker>();
4424 AU.addRequired<ScalarEvolutionWrapperPass>();
4425 AU.addRequired<AAResultsWrapperPass>();
4426 AU.addRequired<TargetTransformInfoWrapperPass>();
4427 AU.addRequired<LoopInfoWrapperPass>();
4428 AU.addRequired<DominatorTreeWrapperPass>();
4429 AU.addRequired<DemandedBitsWrapperPass>();
4430 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
4431 AU.addPreserved<LoopInfoWrapperPass>();
4432 AU.addPreserved<DominatorTreeWrapperPass>();
4433 AU.addPreserved<AAResultsWrapperPass>();
4434 AU.addPreserved<GlobalsAAWrapperPass>();
4435 AU.setPreservesCFG();
4436 }
4437};
4438
4439} // end anonymous namespace
4440
4441PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
4442 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
4443 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
4444 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
4445 auto *AA = &AM.getResult<AAManager>(F);
4446 auto *LI = &AM.getResult<LoopAnalysis>(F);
4447 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
4448 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
4449 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
4450 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
4451
4452 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
1
Calling 'SLPVectorizerPass::runImpl'
4453 if (!Changed)
4454 return PreservedAnalyses::all();
4455
4456 PreservedAnalyses PA;
4457 PA.preserveSet<CFGAnalyses>();
4458 PA.preserve<AAManager>();
4459 PA.preserve<GlobalsAA>();
4460 return PA;
4461}
4462
4463bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
4464 TargetTransformInfo *TTI_,
4465 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
4466 LoopInfo *LI_, DominatorTree *DT_,
4467 AssumptionCache *AC_, DemandedBits *DB_,
4468 OptimizationRemarkEmitter *ORE_) {
4469 SE = SE_;
4470 TTI = TTI_;
4471 TLI = TLI_;
4472 AA = AA_;
4473 LI = LI_;
4474 DT = DT_;
4475 AC = AC_;
4476 DB = DB_;
4477 DL = &F.getParent()->getDataLayout();
4478
4479 Stores.clear();
4480 GEPs.clear();
4481 bool Changed = false;
4482
4483 // If the target claims to have no vector registers don't attempt
4484 // vectorization.
4485 if (!TTI->getNumberOfRegisters(true))
2
Assuming the condition is false
3
Taking false branch
4486 return false;
4487
4488 // Don't vectorize when the attribute NoImplicitFloat is used.
4489 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
4
Assuming the condition is false
5
Taking false branch
4490 return false;
4491
4492 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing blocks in " <<
F.getName() << ".\n"; } } while (false)
;
4493
4494 // Use the bottom up slp vectorizer to construct chains that start with
4495 // store instructions.
4496 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
4497
4498 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
4499 // delete instructions.
4500
4501 // Scan the blocks in the function in post order.
4502 for (auto BB : post_order(&F.getEntryBlock())) {
4503 collectSeedInstructions(BB);
4504
4505 // Vectorize trees that end at stores.
4506 if (!Stores.empty()) {
6
Assuming the condition is false
7
Taking false branch
10
Assuming the condition is false
11
Taking false branch
14
Assuming the condition is false
15
Taking false branch
4507 DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found stores for " << Stores
.size() << " underlying objects.\n"; } } while (false)
4508 << " underlying objects.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found stores for " << Stores
.size() << " underlying objects.\n"; } } while (false)
;
4509 Changed |= vectorizeStoreChains(R);
4510 }
4511
4512 // Vectorize trees that end at reductions.
4513 Changed |= vectorizeChainsInBlock(BB, R);
4514
4515 // Vectorize the index computations of getelementptr instructions. This
4516 // is primarily intended to catch gather-like idioms ending at
4517 // non-consecutive loads.
4518 if (!GEPs.empty()) {
8
Assuming the condition is false
9
Taking false branch
12
Assuming the condition is false
13
Taking false branch
16
Assuming the condition is false
17
Taking false branch
4519 DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found GEPs for " << GEPs
.size() << " underlying objects.\n"; } } while (false)
4520 << " underlying objects.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found GEPs for " << GEPs
.size() << " underlying objects.\n"; } } while (false)
;
4521 Changed |= vectorizeGEPIndices(BB, R);
4522 }
4523 }
4524
4525 if (Changed) {
18
Assuming 'Changed' is not equal to 0
19
Taking true branch
4526 R.optimizeGatherSequence();
20
Calling 'BoUpSLP::optimizeGatherSequence'
4527 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: vectorized \"" << F.getName
() << "\"\n"; } } while (false)
;
4528 DEBUG(verifyFunction(F));
4529 }
4530 return Changed;
4531}
4532
4533/// \brief Check that the Values in the slice in VL array are still existent in
4534/// the WeakTrackingVH array.
4535/// Vectorization of part of the VL array may cause later values in the VL array
4536/// to become invalid. We track when this has happened in the WeakTrackingVH
4537/// array.
4538static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
4539 ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
4540 unsigned SliceSize) {
4541 VL = VL.slice(SliceBegin, SliceSize);
4542 VH = VH.slice(SliceBegin, SliceSize);
4543 return !std::equal(VL.begin(), VL.end(), VH.begin());
4544}
4545
4546bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
4547 unsigned VecRegSize) {
4548 const unsigned ChainLen = Chain.size();
4549 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLendo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing a store chain of length "
<< ChainLen << "\n"; } } while (false)
4550 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing a store chain of length "
<< ChainLen << "\n"; } } while (false)
;
4551 const unsigned Sz = R.getVectorElementSize(Chain[0]);
4552 const unsigned VF = VecRegSize / Sz;
4553
4554 if (!isPowerOf2_32(Sz) || VF < 2)
4555 return false;
4556
4557 // Keep track of values that were deleted by vectorizing in the loop below.
4558 const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
4559
4560 bool Changed = false;
4561 // Look for profitable vectorizable trees at all offsets, starting at zero.
4562 for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
4563
4564 // Check that a previous iteration of this loop did not delete the Value.
4565 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
4566 continue;
4567
4568 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing " << VF <<
" stores at offset " << i << "\n"; } } while (false
)
4569 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing " << VF <<
" stores at offset " << i << "\n"; } } while (false
)
;
4570 ArrayRef<Value *> Operands = Chain.slice(i, VF);
4571
4572 R.buildTree(Operands);
4573 if (R.isTreeTinyAndNotFullyVectorizable())
4574 continue;
4575
4576 R.computeMinimumValueSizes();
4577
4578 int Cost = R.getTreeCost();
4579
4580 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found cost=" << Cost <<
" for VF=" << VF << "\n"; } } while (false)
;
4581 if (Cost < -SLPCostThreshold) {
4582 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Decided to vectorize cost=" <<
Cost << "\n"; } } while (false)
;
4583
4584 using namespace ore;
4585
4586 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
4587 cast<StoreInst>(Chain[i]))
4588 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
4589 << " and with tree size "
4590 << NV("TreeSize", R.getTreeSize()));
4591
4592 R.vectorizeTree();
4593
4594 // Move to the next bundle.
4595 i += VF - 1;
4596 Changed = true;
4597 }
4598 }
4599
4600 return Changed;
4601}
4602
4603bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
4604 BoUpSLP &R) {
4605 SetVector<StoreInst *> Heads;
4606 SmallDenseSet<StoreInst *> Tails;
4607 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
4608
4609 // We may run into multiple chains that merge into a single chain. We mark the
4610 // stores that we vectorized so that we don't visit the same store twice.
4611 BoUpSLP::ValueSet VectorizedStores;
4612 bool Changed = false;
4613
4614 // Do a quadratic search on all of the given stores in reverse order and find
4615 // all of the pairs of stores that follow each other.
4616 SmallVector<unsigned, 16> IndexQueue;
4617 unsigned E = Stores.size();
4618 IndexQueue.resize(E - 1);
4619 for (unsigned I = E; I > 0; --I) {
4620 unsigned Idx = I - 1;
4621 // If a store has multiple consecutive store candidates, search Stores
4622 // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
4623 // This is because usually pairing with immediate succeeding or preceding
4624 // candidates creates the best chance to find an SLP vectorization opportunity.
4625 unsigned Offset = 1;
4626 unsigned Cnt = 0;
4627 for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
4628 if (Idx >= Offset) {
4629 IndexQueue[Cnt] = Idx - Offset;
4630 ++Cnt;
4631 }
4632 if (Idx + Offset < E) {
4633 IndexQueue[Cnt] = Idx + Offset;
4634 ++Cnt;
4635 }
4636 }
4637
4638 for (auto K : IndexQueue) {
4639 if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) {
4640 Tails.insert(Stores[Idx]);
4641 Heads.insert(Stores[K]);
4642 ConsecutiveChain[Stores[K]] = Stores[Idx];
4643 break;
4644 }
4645 }
4646 }
4647
4648 // For stores that start but don't end a link in the chain:
4649 for (auto *SI : llvm::reverse(Heads)) {
4650 if (Tails.count(SI))
4651 continue;
4652
4653 // We found a store instr that starts a chain. Now follow the chain and try
4654 // to vectorize it.
4655 BoUpSLP::ValueList Operands;
4656 StoreInst *I = SI;
4657 // Collect the chain into a list.
4658 while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
4659 Operands.push_back(I);
4660 // Move to the next value in the chain.
4661 I = ConsecutiveChain[I];
4662 }
4663
4664 // FIXME: Is division-by-2 the correct step? Should we assert that the
4665 // register size is a power-of-2?
4666 for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
4667 Size /= 2) {
4668 if (vectorizeStoreChain(Operands, R, Size)) {
4669 // Mark the vectorized stores so that we don't vectorize them again.
4670 VectorizedStores.insert(Operands.begin(), Operands.end());
4671 Changed = true;
4672 break;
4673 }
4674 }
4675 }
4676
4677 return Changed;
4678}
4679
4680void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
4681 // Initialize the collections. We will make a single pass over the block.
4682 Stores.clear();
4683 GEPs.clear();
4684
4685 // Visit the store and getelementptr instructions in BB and organize them in
4686 // Stores and GEPs according to the underlying objects of their pointer
4687 // operands.
4688 for (Instruction &I : *BB) {
4689 // Ignore store instructions that are volatile or have a pointer operand
4690 // that doesn't point to a scalar type.
4691 if (auto *SI = dyn_cast<StoreInst>(&I)) {
4692 if (!SI->isSimple())
4693 continue;
4694 if (!isValidElementType(SI->getValueOperand()->getType()))
4695 continue;
4696 Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
4697 }
4698
4699 // Ignore getelementptr instructions that have more than one index, a
4700 // constant index, or a pointer operand that doesn't point to a scalar
4701 // type.
4702 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
4703 auto Idx = GEP->idx_begin()->get();
4704 if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
4705 continue;
4706 if (!isValidElementType(Idx->getType()))
4707 continue;
4708 if (GEP->getType()->isVectorTy())
4709 continue;
4710 GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
4711 }
4712 }
4713}
4714
4715bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
4716 if (!A || !B)
4717 return false;
4718 Value *VL[] = { A, B };
4719 return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
4720}
4721
4722bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
4723 int UserCost, bool AllowReorder) {
4724 if (VL.size() < 2)
4725 return false;
4726
4727 DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n"; } } while (false)
4728 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n"; } } while (false)
;
4729
4730 // Check that all of the parts are scalar instructions of the same type.
4731 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
4732 if (!I0)
4733 return false;
4734
4735 unsigned Opcode0 = I0->getOpcode();
4736
4737 unsigned Sz = R.getVectorElementSize(I0);
4738 unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
4739 unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
4740 if (MaxVF < 2) {
4741 R.getORE()->emit([&]() {
4742 return OptimizationRemarkMissed(
4743 SV_NAME"slp-vectorizer", "SmallVF", I0)
4744 << "Cannot SLP vectorize list: vectorization factor "
4745 << "less than 2 is not supported";
4746 });
4747 return false;
4748 }
4749
4750 for (Value *V : VL) {
4751 Type *Ty = V->getType();
4752 if (!isValidElementType(Ty)) {
4753 // NOTE: the following will give the user an internal LLVM type name, which may not be useful
4754 R.getORE()->emit([&]() {
4755 std::string type_str;
4756 llvm::raw_string_ostream rso(type_str);
4757 Ty->print(rso);
4758 return OptimizationRemarkMissed(
4759 SV_NAME, "UnsupportedType", I0)
4760 << "Cannot SLP vectorize list: type "
4761 << rso.str() + " is unsupported by vectorizer";
4762 });
4763 return false;
4764 }
4765 Instruction *Inst = dyn_cast<Instruction>(V);
4766
4767 if (!Inst)
4768 return false;
4769 if (Inst->getOpcode() != Opcode0) {
4770 R.getORE()->emit([&]() {
4771 return OptimizationRemarkMissed(
4772 SV_NAME, "InequableTypes", I0)
4773 << "Cannot SLP vectorize list: not all of the "
4774 << "parts of scalar instructions are of the same type: "
4775 << ore::NV("Instruction1Opcode", I0) << " and "
4776 << ore::NV("Instruction2Opcode", Inst);
4777 });
4778 return false;
4779 }
4780 }
4781
4782 bool Changed = false;
4783 bool CandidateFound = false;
4784 int MinCost = SLPCostThreshold;
4785
4786 // Keep track of values that were deleted by vectorizing in the loop below.
4787 SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
4788
4789 unsigned NextInst = 0, MaxInst = VL.size();
4790 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
4791 VF /= 2) {
4792 // No actual vectorization should happen if the number of parts is the same
4793 // as the provided vectorization factor (i.e. the scalar type is used for
4794 // vector code during codegen).
4795 auto *VecTy = VectorType::get(VL[0]->getType(), VF);
4796 if (TTI->getNumberOfParts(VecTy) == VF)
4797 continue;
4798 for (unsigned I = NextInst; I < MaxInst; ++I) {
4799 unsigned OpsWidth = 0;
4800
4801 if (I + VF > MaxInst)
4802 OpsWidth = MaxInst - I;
4803 else
4804 OpsWidth = VF;
4805
4806 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
4807 break;
4808
4809 // Check that a previous iteration of this loop did not delete the Value.
4810 if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
4811 continue;
4812
4813 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
4814 << "\n");
4815 ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
4816
4817 R.buildTree(Ops);
4818 // TODO: check if we can allow reordering for more cases.
4819 if (AllowReorder && R.shouldReorder()) {
4820 // Conceptually, there is nothing actually preventing us from trying to
4821 // reorder a larger list. In fact, we do exactly this when vectorizing
4822 // reductions. However, at this point, we only expect to get here when
4823 // there are exactly two operations.
4824 assert(Ops.size() == 2);
4825 Value *ReorderedOps[] = {Ops[1], Ops[0]};
4826 R.buildTree(ReorderedOps, None);
4827 }
4828 if (R.isTreeTinyAndNotFullyVectorizable())
4829 continue;
4830
4831 R.computeMinimumValueSizes();
4832 int Cost = R.getTreeCost() - UserCost;
4833 CandidateFound = true;
4834 MinCost = std::min(MinCost, Cost);
4835
4836 if (Cost < -SLPCostThreshold) {
4837 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
4838 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
4839 cast<Instruction>(Ops[0]))
4840 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
4841 << " and with tree size "
4842 << ore::NV("TreeSize", R.getTreeSize()));
4843
4844 R.vectorizeTree();
4845 // Move to the next bundle.
4846 I += VF - 1;
4847 NextInst = I + 1;
4848 Changed = true;
4849 }
4850 }
4851 }
4852
4853 if (!Changed && CandidateFound) {
4854 R.getORE()->emit([&]() {
4855 return OptimizationRemarkMissed(
4856 SV_NAME, "NotBeneficial", I0)
4857 << "List vectorization was possible but not beneficial with cost "
4858 << ore::NV("Cost", MinCost) << " >= "
4859 << ore::NV("Treshold", -SLPCostThreshold);
4860 });
4861 } else if (!Changed) {
4862 R.getORE()->emit([&]() {
4863 return OptimizationRemarkMissed(
4864 SV_NAME, "NotPossible", I0)
4865 << "Cannot SLP vectorize list: vectorization was impossible"
4866 << " with available vectorization factors";
4867 });
4868 }
4869 return Changed;
4870}
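
A worked trace of the slicing loop above, assuming VL.size() == 6, MinVF == 2, no RAUW'ed values, and every slice being profitable (purely illustrative):

// MaxVF = std::max(PowerOf2Floor(6), 2) = 4
// VF = 4, I = 0: OpsWidth = 4, build tree for VL[0..3]; on success I += 3 and NextInst = 4
// VF = 4, I = 4: I + VF > 6, so OpsWidth = 6 - 4 = 2, build tree for VL[4..5]
// afterwards NextInst + 1 >= MaxInst, so the VF = 2 round of the outer loop never runs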
4871
4872bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
4873 if (!I)
4874 return false;
4875
4876 if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
4877 return false;
4878
4879 Value *P = I->getParent();
4880
4881 // Vectorize in current basic block only.
4882 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
4883 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
4884 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
4885 return false;
4886
4887 // Try to vectorize V.
4888 if (tryToVectorizePair(Op0, Op1, R))
4889 return true;
4890
4891 auto *A = dyn_cast<BinaryOperator>(Op0);
4892 auto *B = dyn_cast<BinaryOperator>(Op1);
4893 // Try to skip B.
4894 if (B && B->hasOneUse()) {
4895 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
4896 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
4897 if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
4898 return true;
4899 if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
4900 return true;
4901 }
4902
4903 // Try to skip A.
4904 if (A && A->hasOneUse()) {
4905 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
4906 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
4907 if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
4908 return true;
4909 if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
4910 return true;
4911 }
4912 return false;
4913}
4914
4915/// \brief Generate a shuffle mask to be used in a reduction tree.
4916///
4917/// \param VecLen The length of the vector to be reduced.
4918/// \param NumEltsToRdx The number of elements that should be reduced in the
4919/// vector.
4920/// \param IsPairwise Whether the reduction is a pairwise or splitting
4921/// reduction. A pairwise reduction will generate a mask of
4922/// <0,2,...> or <1,3,..> while a splitting reduction will generate
4923/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
4924/// \param IsLeft True will generate a mask of even elements, odd otherwise.
4925static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
4926 bool IsPairwise, bool IsLeft,
4927 IRBuilder<> &Builder) {
4928 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
4929
4930 SmallVector<Constant *, 32> ShuffleMask(
4931 VecLen, UndefValue::get(Builder.getInt32Ty()));
4932
4933 if (IsPairwise)
4934 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
4935 for (unsigned i = 0; i != NumEltsToRdx; ++i)
4936 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
4937 else
4938 // Move the upper half of the vector to the lower half.
4939 for (unsigned i = 0; i != NumEltsToRdx; ++i)
4940 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
4941
4942 return ConstantVector::get(ShuffleMask);
4943}
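
For reference, the masks the helper above builds, worked out from its two loops for VecLen = 8 and NumEltsToRdx = 4 (all remaining lanes stay undef):

// pairwise, IsLeft  : <i32 0, i32 2, i32 4, i32 6, undef, undef, undef, undef>
// pairwise, !IsLeft : <i32 1, i32 3, i32 5, i32 7, undef, undef, undef, undef>
// splitting         : <i32 4, i32 5, i32 6, i32 7, undef, undef, undef, undef>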
4944
4945namespace {
4946
4947/// Model horizontal reductions.
4948///
4949/// A horizontal reduction is a tree of reduction operations (currently add and
4950/// fadd) that has operations that can be put into a vector as its leaf.
4951/// For example, this tree:
4952///
4953/// mul mul mul mul
4954/// \ / \ /
4955/// + +
4956/// \ /
4957/// +
4958/// This tree has "mul" as its reduced values and "+" as its reduction
4959/// operations. A reduction might be feeding into a store or a binary operation
4960/// feeding a phi.
4961/// ...
4962/// \ /
4963/// +
4964/// |
4965/// phi +=
4966///
4967/// Or:
4968/// ...
4969/// \ /
4970/// +
4971/// |
4972/// *p =
4973///
4974class HorizontalReduction {
4975 using ReductionOpsType = SmallVector<Value *, 16>;
4976 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
4977 ReductionOpsListType ReductionOps;
4978 SmallVector<Value *, 32> ReducedVals;
4979 // Use map vector to make stable output.
4980 MapVector<Instruction *, Value *> ExtraArgs;
4981
4982 /// Kind of the reduction data.
4983 enum ReductionKind {
4984 RK_None, /// Not a reduction.
4985 RK_Arithmetic, /// Binary reduction data.
4986 RK_Min, /// Minimum reduction data.
4987 RK_UMin, /// Unsigned minimum reduction data.
4988 RK_Max, /// Maximum reduction data.
4989 RK_UMax, /// Unsigned maximum reduction data.
4990 };
4991
4992 /// Contains info about operation, like its opcode, left and right operands.
4993 class OperationData {
4994 /// Opcode of the instruction.
4995 unsigned Opcode = 0;
4996
4997 /// Left operand of the reduction operation.
4998 Value *LHS = nullptr;
4999
5000 /// Right operand of the reduction operation.
5001 Value *RHS = nullptr;
5002
5003 /// Kind of the reduction operation.
5004 ReductionKind Kind = RK_None;
5005
5006 /// True if float point min/max reduction has no NaNs.
5007 bool NoNaN = false;
5008
5009 /// Checks if the reduction operation can be vectorized.
5010 bool isVectorizable() const {
5011 return LHS && RHS &&
5012 // We currently only support adds && min/max reductions.
5013 ((Kind == RK_Arithmetic &&
5014 (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
5015 ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
5016 (Kind == RK_Min || Kind == RK_Max)) ||
5017 (Opcode == Instruction::ICmp &&
5018 (Kind == RK_UMin || Kind == RK_UMax)));
5019 }
5020
5021 /// Creates reduction operation with the current opcode.
5022 Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
5023 assert(isVectorizable() &&
5024 "Expected add|fadd or min/max reduction operation.");
5025 Value *Cmp;
5026 switch (Kind) {
5027 case RK_Arithmetic:
5028 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
5029 Name);
5030 case RK_Min:
5031 Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
5032 : Builder.CreateFCmpOLT(LHS, RHS);
5033 break;
5034 case RK_Max:
5035 Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
5036 : Builder.CreateFCmpOGT(LHS, RHS);
5037 break;
5038 case RK_UMin:
5039 assert(Opcode == Instruction::ICmp && "Expected integer types.");
5040 Cmp = Builder.CreateICmpULT(LHS, RHS);
5041 break;
5042 case RK_UMax:
5043 assert(Opcode == Instruction::ICmp && "Expected integer types.");
5044 Cmp = Builder.CreateICmpUGT(LHS, RHS);
5045 break;
5046 case RK_None:
5047 llvm_unreachable("Unknown reduction operation.");
5048 }
5049 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
5050 }
5051
5052 public:
5053 explicit OperationData() = default;
5054
5055 /// Construction for reduced values. They are identified by opcode only and
5056 /// don't have associated LHS/RHS values.
5057 explicit OperationData(Value *V) {
5058 if (auto *I = dyn_cast<Instruction>(V))
5059 Opcode = I->getOpcode();
5060 }
5061
5062 /// Constructor for reduction operations with opcode and its left and
5063 /// right operands.
5064 OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
5065 bool NoNaN = false)
5066 : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
5067 assert(Kind != RK_None && "One of the reduction operations is expected.");
5068 }
5069
5070 explicit operator bool() const { return Opcode; }
5071
5072 /// Get the index of the first operand.
5073 unsigned getFirstOperandIndex() const {
5074 assert(!!*this && "The opcode is not set.");
5075 switch (Kind) {
5076 case RK_Min:
5077 case RK_UMin:
5078 case RK_Max:
5079 case RK_UMax:
5080 return 1;
5081 case RK_Arithmetic:
5082 case RK_None:
5083 break;
5084 }
5085 return 0;
5086 }
5087
5088 /// Total number of operands in the reduction operation.
5089 unsigned getNumberOfOperands() const {
5090 assert(Kind != RK_None && !!*this && LHS && RHS &&
5091 "Expected reduction operation.");
5092 switch (Kind) {
5093 case RK_Arithmetic:
5094 return 2;
5095 case RK_Min:
5096 case RK_UMin:
5097 case RK_Max:
5098 case RK_UMax:
5099 return 3;
5100 case RK_None:
5101 break;
5102 }
5103 llvm_unreachable("Reduction kind is not set");
5104 }
5105
5106 /// Checks if the operation has the same parent as \p P.
5107 bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
5108 assert(Kind != RK_None && !!*this && LHS && RHS &&
5109 "Expected reduction operation.");
5110 if (!IsRedOp)
5111 return I->getParent() == P;
5112 switch (Kind) {
5113 case RK_Arithmetic:
5114 // Arithmetic reduction operation must be used once only.
5115 return I->getParent() == P;
5116 case RK_Min:
5117 case RK_UMin:
5118 case RK_Max:
5119 case RK_UMax: {
5120 // SelectInst must be used twice while the condition op must have single
5121 // use only.
5122 auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
5123 return I->getParent() == P && Cmp && Cmp->getParent() == P;
5124 }
5125 case RK_None:
5126 break;
5127 }
5128 llvm_unreachable("Reduction kind is not set");
5129 }
5130 /// Expected number of uses for reduction operations/reduced values.
5131 bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
5132 assert(Kind != RK_None && !!*this && LHS && RHS &&
5133 "Expected reduction operation.");
5134 switch (Kind) {
5135 case RK_Arithmetic:
5136 return I->hasOneUse();
5137 case RK_Min:
5138 case RK_UMin:
5139 case RK_Max:
5140 case RK_UMax:
5141 return I->hasNUses(2) &&
5142 (!IsReductionOp ||
5143 cast<SelectInst>(I)->getCondition()->hasOneUse());
5144 case RK_None:
5145 break;
5146 }
5147 llvm_unreachable("Reduction kind is not set");
5148 }
5149
5150 /// Initializes the list of reduction operations.
5151 void initReductionOps(ReductionOpsListType &ReductionOps) {
5152 assert(Kind != RK_None && !!*this && LHS && RHS &&
5153 "Expected reduction operation.");
5154 switch (Kind) {
5155 case RK_Arithmetic:
5156 ReductionOps.assign(1, ReductionOpsType());
5157 break;
5158 case RK_Min:
5159 case RK_UMin:
5160 case RK_Max:
5161 case RK_UMax:
5162 ReductionOps.assign(2, ReductionOpsType());
5163 break;
5164 case RK_None:
5165 llvm_unreachable("Reduction kind is not set");
5166 }
5167 }
5168 /// Add all reduction operations for the reduction instruction \p I.
5169 void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
5170 assert(Kind != RK_None && !!*this && LHS && RHS &&
5171 "Expected reduction operation.");
5172 switch (Kind) {
5173 case RK_Arithmetic:
5174 ReductionOps[0].emplace_back(I);
5175 break;
5176 case RK_Min:
5177 case RK_UMin:
5178 case RK_Max:
5179 case RK_UMax:
5180 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
5181 ReductionOps[1].emplace_back(I);
5182 break;
5183 case RK_None:
5184 llvm_unreachable("Reduction kind is not set");
5185 }
5186 }
5187
5188 /// Checks if instruction is associative and can be vectorized.
5189 bool isAssociative(Instruction *I) const {
5190 assert(Kind != RK_None && *this && LHS && RHS &&
5191 "Expected reduction operation.");
5192 switch (Kind) {
5193 case RK_Arithmetic:
5194 return I->isAssociative();
5195 case RK_Min:
5196 case RK_Max:
5197 return Opcode == Instruction::ICmp ||
5198 cast<Instruction>(I->getOperand(0))->isFast();
5199 case RK_UMin:
5200 case RK_UMax:
5201 assert(Opcode == Instruction::ICmp &&
5202 "Only integer compare operation is expected.");
5203 return true;
5204 case RK_None:
5205 break;
5206 }
5207 llvm_unreachable("Reduction kind is not set");
5208 }
5209
5210 /// Checks if the reduction operation can be vectorized.
5211 bool isVectorizable(Instruction *I) const {
5212 return isVectorizable() && isAssociative(I);
5213 }
5214
5215 /// Checks if two operation data are both a reduction op or both a reduced
5216 /// value.
5217 bool operator==(const OperationData &OD) {
5218 assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
5219 "One of the comparing operations is incorrect.");
5220 return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
5221 }
5222 bool operator!=(const OperationData &OD) { return !(*this == OD); }
5223 void clear() {
5224 Opcode = 0;
5225 LHS = nullptr;
5226 RHS = nullptr;
5227 Kind = RK_None;
5228 NoNaN = false;
5229 }
5230
5231 /// Get the opcode of the reduction operation.
5232 unsigned getOpcode() const {
5233 assert(isVectorizable() && "Expected vectorizable operation.");
5234 return Opcode;
5235 }
5236
5237 /// Get kind of reduction data.
5238 ReductionKind getKind() const { return Kind; }
5239 Value *getLHS() const { return LHS; }
5240 Value *getRHS() const { return RHS; }
5241 Type *getConditionType() const {
5242 switch (Kind) {
5243 case RK_Arithmetic:
5244 return nullptr;
5245 case RK_Min:
5246 case RK_Max:
5247 case RK_UMin:
5248 case RK_UMax:
5249 return CmpInst::makeCmpResultType(LHS->getType());
5250 case RK_None:
5251 break;
5252 }
5253 llvm_unreachable("Reduction kind is not set");
5254 }
5255
5256 /// Creates reduction operation with the current opcode with the IR flags
5257 /// from \p ReductionOps.
5258 Value *createOp(IRBuilder<> &Builder, const Twine &Name,
5259 const ReductionOpsListType &ReductionOps) const {
5260 assert(isVectorizable() &&
5261 "Expected add|fadd or min/max reduction operation.");
5262 auto *Op = createOp(Builder, Name);
5263 switch (Kind) {
5264 case RK_Arithmetic:
5265 propagateIRFlags(Op, ReductionOps[0]);
5266 return Op;
5267 case RK_Min:
5268 case RK_Max:
5269 case RK_UMin:
5270 case RK_UMax:
5271 if (auto *SI = dyn_cast<SelectInst>(Op))
5272 propagateIRFlags(SI->getCondition(), ReductionOps[0]);
5273 propagateIRFlags(Op, ReductionOps[1]);
5274 return Op;
5275 case RK_None:
5276 break;
5277 }
5278 llvm_unreachable("Unknown reduction operation.");
5279 }
5280 /// Creates reduction operation with the current opcode with the IR flags
5281 /// from \p I.
5282 Value *createOp(IRBuilder<> &Builder, const Twine &Name,
5283 Instruction *I) const {
5284 assert(isVectorizable() &&
5285 "Expected add|fadd or min/max reduction operation.");
5286 auto *Op = createOp(Builder, Name);
5287 switch (Kind) {
5288 case RK_Arithmetic:
5289 propagateIRFlags(Op, I);
5290 return Op;
5291 case RK_Min:
5292 case RK_Max:
5293 case RK_UMin:
5294 case RK_UMax:
5295 if (auto *SI = dyn_cast<SelectInst>(Op)) {
5296 propagateIRFlags(SI->getCondition(),
5297 cast<SelectInst>(I)->getCondition());
5298 }
5299 propagateIRFlags(Op, I);
5300 return Op;
5301 case RK_None:
5302 break;
5303 }
5304 llvm_unreachable("Unknown reduction operation.");
5305 }
5306
5307 TargetTransformInfo::ReductionFlags getFlags() const {
5308 TargetTransformInfo::ReductionFlags Flags;
5309 Flags.NoNaN = NoNaN;
5310 switch (Kind) {
5311 case RK_Arithmetic:
5312 break;
5313 case RK_Min:
5314 Flags.IsSigned = Opcode == Instruction::ICmp;
5315 Flags.IsMaxOp = false;
5316 break;
5317 case RK_Max:
5318 Flags.IsSigned = Opcode == Instruction::ICmp;
5319 Flags.IsMaxOp = true;
5320 break;
5321 case RK_UMin:
5322 Flags.IsSigned = false;
5323 Flags.IsMaxOp = false;
5324 break;
5325 case RK_UMax:
5326 Flags.IsSigned = false;
5327 Flags.IsMaxOp = true;
5328 break;
5329 case RK_None:
5330 llvm_unreachable("Reduction kind is not set");
5331 }
5332 return Flags;
5333 }
5334 };
5335
5336 Instruction *ReductionRoot = nullptr;
5337
5338 /// The operation data of the reduction operation.
5339 OperationData ReductionData;
5340
5341 /// The operation data of the values we perform a reduction on.
5342 OperationData ReducedValueData;
5343
5344 /// Should we model this reduction as a pairwise reduction tree or a tree that
5345 /// splits the vector in halves and adds those halves.
5346 bool IsPairwiseReduction = false;
5347
5348 /// Checks if the ParentStackElem.first should be marked as a reduction
5349 /// operation with an extra argument or as extra argument itself.
5350 void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
5351 Value *ExtraArg) {
5352 if (ExtraArgs.count(ParentStackElem.first)) {
5353 ExtraArgs[ParentStackElem.first] = nullptr;
5354 // We ran into something like:
5355 // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
5356 // The whole ParentStackElem.first should be considered as an extra value
5357 // in this case.
5358 // Do not perform analysis of remaining operands of ParentStackElem.first
5359 // instruction, this whole instruction is an extra argument.
5360 ParentStackElem.second = ParentStackElem.first->getNumOperands();
5361 } else {
5362 // We ran into something like:
5363 // ParentStackElem.first += ... + ExtraArg + ...
5364 ExtraArgs[ParentStackElem.first] = ExtraArg;
5365 }
5366 }
5367
5368 static OperationData getOperationData(Value *V) {
5369 if (!V)
5370 return OperationData();
5371
5372 Value *LHS;
5373 Value *RHS;
5374 if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
5375 return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
5376 RK_Arithmetic);
5377 }
5378 if (auto *Select = dyn_cast<SelectInst>(V)) {
5379 // Look for a min/max pattern.
5380 if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
5381 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
5382 } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
5383 return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
5384 } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
5385 m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
5386 return OperationData(
5387 Instruction::FCmp, LHS, RHS, RK_Min,
5388 cast<Instruction>(Select->getCondition())->hasNoNaNs());
5389 } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
5390 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
5391 } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
5392 return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
5393 } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
5394 m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
5395 return OperationData(
5396 Instruction::FCmp, LHS, RHS, RK_Max,
5397 cast<Instruction>(Select->getCondition())->hasNoNaNs());
5398 }
5399 }
5400 return OperationData(V);
5401 }
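
A sketch of how this matcher classifies a few values (hypothetical IR, not drawn from the report):

// %s = add i32 %a, %b                  -> OperationData(Add, %a, %b, RK_Arithmetic)
// %c = icmp slt i32 %a, %b
// %m = select i1 %c, i32 %a, i32 %b    -> matches m_SMin: OperationData(ICmp, %a, %b, RK_Min)
// a load, phi or constant              -> OperationData(V): opcode only (if any), Kind stays RK_None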
5402
5403public:
5404 HorizontalReduction() = default;
5405
5406 /// \brief Try to find a reduction tree.
5407 bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
5408 assert((!Phi || is_contained(Phi->operands(), B)) &&
5409 "This phi needs to use the binary operator");
5410
5411 ReductionData = getOperationData(B);
5412
5413 // We could have an initial reduction that is not an add.
5414 // r *= v1 + v2 + v3 + v4
5415 // In such a case start looking for a tree rooted in the first '+'.
5416 if (Phi) {
5417 if (ReductionData.getLHS() == Phi) {
5418 Phi = nullptr;
5419 B = dyn_cast<Instruction>(ReductionData.getRHS());
5420 ReductionData = getOperationData(B);
5421 } else if (ReductionData.getRHS() == Phi) {
5422 Phi = nullptr;
5423 B = dyn_cast<Instruction>(ReductionData.getLHS());
5424 ReductionData = getOperationData(B);
5425 }
5426 }
5427
5428 if (!ReductionData.isVectorizable(B))
5429 return false;
5430
5431 Type *Ty = B->getType();
5432 if (!isValidElementType(Ty))
5433 return false;
5434
5435 ReducedValueData.clear();
5436 ReductionRoot = B;
5437
5438 // Post order traverse the reduction tree starting at B. We only handle true
5439 // trees containing only binary operators.
5440 SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
5441 Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
5442 ReductionData.initReductionOps(ReductionOps);
5443 while (!Stack.empty()) {
5444 Instruction *TreeN = Stack.back().first;
5445 unsigned EdgeToVist = Stack.back().second++;
5446 OperationData OpData = getOperationData(TreeN);
5447 bool IsReducedValue = OpData != ReductionData;
5448
5449 // Postorder visit.
5450 if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
5451 if (IsReducedValue)
5452 ReducedVals.push_back(TreeN);
5453 else {
5454 auto I = ExtraArgs.find(TreeN);
5455 if (I != ExtraArgs.end() && !I->second) {
5456 // Check if TreeN is an extra argument of its parent operation.
5457 if (Stack.size() <= 1) {
5458 // TreeN can't be an extra argument as it is a root reduction
5459 // operation.
5460 return false;
5461 }
5462 // Yes, TreeN is an extra argument, do not add it to a list of
5463 // reduction operations.
5464 // Stack[Stack.size() - 2] always points to the parent operation.
5465 markExtraArg(Stack[Stack.size() - 2], TreeN);
5466 ExtraArgs.erase(TreeN);
5467 } else
5468 ReductionData.addReductionOps(TreeN, ReductionOps);
5469 }
5470 // Retract.
5471 Stack.pop_back();
5472 continue;
5473 }
5474
5475 // Visit left or right.
5476 Value *NextV = TreeN->getOperand(EdgeToVist);
5477 if (NextV != Phi) {
5478 auto *I = dyn_cast<Instruction>(NextV);
5479 OpData = getOperationData(I);
5480 // Continue analysis if the next operand is a reduction operation or
5481 // (possibly) a reduced value. If the reduced value opcode is not set,
5482 // the first operation encountered that is not the reduction operation is
5483 // treated as the reduced value class.
5484 if (I && (!ReducedValueData || OpData == ReducedValueData ||
5485 OpData == ReductionData)) {
5486 const bool IsReductionOperation = OpData == ReductionData;
5487 // Only handle trees in the current basic block.
5488 if (!ReductionData.hasSameParent(I, B->getParent(),
5489 IsReductionOperation)) {
5490 // I is an extra argument for TreeN (its parent operation).
5491 markExtraArg(Stack.back(), I);
5492 continue;
5493 }
5494
5495 // Each tree node needs to have minimal number of users except for the
5496 // ultimate reduction.
5497 if (!ReductionData.hasRequiredNumberOfUses(I,
5498 OpData == ReductionData) &&
5499 I != B) {
5500 // I is an extra argument for TreeN (its parent operation).
5501 markExtraArg(Stack.back(), I);
5502 continue;
5503 }
5504
5505 if (IsReductionOperation) {
5506 // We need to be able to reassociate the reduction operations.
5507 if (!OpData.isAssociative(I)) {
5508 // I is an extra argument for TreeN (its parent operation).
5509 markExtraArg(Stack.back(), I);
5510 continue;
5511 }
5512 } else if (ReducedValueData &&
5513 ReducedValueData != OpData) {
5514 // Make sure that the opcodes of the operations that we are going to
5515 // reduce match.
5516 // I is an extra argument for TreeN (its parent operation).
5517 markExtraArg(Stack.back(), I);
5518 continue;
5519 } else if (!ReducedValueData)
5520 ReducedValueData = OpData;
5521
5522 Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
5523 continue;
5524 }
5525 }
5526 // NextV is an extra argument for TreeN (its parent operation).
5527 markExtraArg(Stack.back(), NextV);
5528 }
5529 return true;
5530 }
5531
5532 /// \brief Attempt to vectorize the tree found by
5533 /// matchAssociativeReduction.
5534 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
5535 if (ReducedVals.empty())
5536 return false;
5537
5538 // If there is a sufficient number of reduction values, reduce
5539 // to a nearby power-of-2. Can safely generate oversized
5540 // vectors and rely on the backend to split them to legal sizes.
5541 unsigned NumReducedVals = ReducedVals.size();
5542 if (NumReducedVals < 4)
5543 return false;
5544
5545 unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
5546
5547 Value *VectorizedTree = nullptr;
5548 IRBuilder<> Builder(ReductionRoot);
5549 FastMathFlags Unsafe;
5550 Unsafe.setFast();
5551 Builder.setFastMathFlags(Unsafe);
5552 unsigned i = 0;
5553
5554 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
5555 // The same extra argument may be used several times, so log each attempt
5556 // to use it.
5557 for (auto &Pair : ExtraArgs)
5558 ExternallyUsedValues[Pair.second].push_back(Pair.first);
5559 SmallVector<Value *, 16> IgnoreList;
5560 for (auto &V : ReductionOps)
5561 IgnoreList.append(V.begin(), V.end());
5562 while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
5563 auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
5564 V.buildTree(VL, ExternallyUsedValues, IgnoreList);
5565 if (V.shouldReorder()) {
5566 SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
5567 V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
5568 }
5569 if (V.isTreeTinyAndNotFullyVectorizable())
5570 break;
5571
5572 V.computeMinimumValueSizes();
5573
5574 // Estimate cost.
5575 int Cost =
5576 V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
5577 if (Cost >= -SLPCostThreshold) {
5578 V.getORE()->emit([&]() {
5579 return OptimizationRemarkMissed(
5580 SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
5581 << "Vectorizing horizontal reduction is possible"
5582 << "but not beneficial with cost "
5583 << ore::NV("Cost", Cost) << " and threshold "
5584 << ore::NV("Threshold", -SLPCostThreshold);
5585 });
5586 break;
5587 }
5588
5589 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
5590 << ". (HorRdx)\n");
5591 V.getORE()->emit([&]() {
5592 return OptimizationRemark(
5593 SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
5594 << "Vectorized horizontal reduction with cost "
5595 << ore::NV("Cost", Cost) << " and with tree size "
5596 << ore::NV("TreeSize", V.getTreeSize());
5597 });
5598
5599 // Vectorize a tree.
5600 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
5601 Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
5602
5603 // Emit a reduction.
5604 Value *ReducedSubTree =
5605 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
5606 if (VectorizedTree) {
5607 Builder.SetCurrentDebugLocation(Loc);
5608 OperationData VectReductionData(ReductionData.getOpcode(),
5609 VectorizedTree, ReducedSubTree,
5610 ReductionData.getKind());
5611 VectorizedTree =
5612 VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
5613 } else
5614 VectorizedTree = ReducedSubTree;
5615 i += ReduxWidth;
5616 ReduxWidth = PowerOf2Floor(NumReducedVals - i);
5617 }
5618
5619 if (VectorizedTree) {
5620 // Finish the reduction.
5621 for (; i < NumReducedVals; ++i) {
5622 auto *I = cast<Instruction>(ReducedVals[i]);
5623 Builder.SetCurrentDebugLocation(I->getDebugLoc());
5624 OperationData VectReductionData(ReductionData.getOpcode(),