Bug Summary

File: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Warning: line 4777, column 22
Called C++ object pointer is null
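Note: "Called C++ object pointer is null" means a member function is invoked through a pointer that the analyzer believes can be null on at least one path. As a minimal, hypothetical sketch of this defect class only (the struct and helper below are invented; this is not the code at SLPVectorizer.cpp line 4777):

#include <iostream>

struct TreeNode {
  int value() const { return V; }
  int V = 0;
};

// Hypothetical helper that can legitimately return null.
TreeNode *findNode(bool Found) {
  static TreeNode N;
  return Found ? &N : nullptr;
}

int main() {
  TreeNode *Node = findNode(false);
  if (!Node)
    std::cout << "no node found\n"; // note: no early return
  // The analyzer reports "Called C++ object pointer is null" on the call
  // below, because the null path from findNode(false) reaches it.
  std::cout << Node->value() << '\n';
  return 0;
}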

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SLPVectorizer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/include -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-01-13-084841-49055-1 -x c++ /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/DenseMap.h"
22#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/MapVector.h"
24#include "llvm/ADT/None.h"
25#include "llvm/ADT/Optional.h"
26#include "llvm/ADT/PostOrderIterator.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallBitVector.h"
30#include "llvm/ADT/SmallPtrSet.h"
31#include "llvm/ADT/SmallSet.h"
32#include "llvm/ADT/SmallVector.h"
33#include "llvm/ADT/Statistic.h"
34#include "llvm/ADT/iterator.h"
35#include "llvm/ADT/iterator_range.h"
36#include "llvm/Analysis/AliasAnalysis.h"
37#include "llvm/Analysis/CodeMetrics.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/LoopAccessAnalysis.h"
41#include "llvm/Analysis/LoopInfo.h"
42#include "llvm/Analysis/MemoryLocation.h"
43#include "llvm/Analysis/OptimizationRemarkEmitter.h"
44#include "llvm/Analysis/ScalarEvolution.h"
45#include "llvm/Analysis/ScalarEvolutionExpressions.h"
46#include "llvm/Analysis/TargetLibraryInfo.h"
47#include "llvm/Analysis/TargetTransformInfo.h"
48#include "llvm/Analysis/ValueTracking.h"
49#include "llvm/Analysis/VectorUtils.h"
50#include "llvm/IR/Attributes.h"
51#include "llvm/IR/BasicBlock.h"
52#include "llvm/IR/Constant.h"
53#include "llvm/IR/Constants.h"
54#include "llvm/IR/DataLayout.h"
55#include "llvm/IR/DebugLoc.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
62#include "llvm/IR/Instructions.h"
63#include "llvm/IR/IntrinsicInst.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/NoFolder.h"
67#include "llvm/IR/Operator.h"
68#include "llvm/IR/PassManager.h"
69#include "llvm/IR/PatternMatch.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
75#include "llvm/IR/Verifier.h"
76#include "llvm/InitializePasses.h"
77#include "llvm/Pass.h"
78#include "llvm/Support/Casting.h"
79#include "llvm/Support/CommandLine.h"
80#include "llvm/Support/Compiler.h"
81#include "llvm/Support/DOTGraphTraits.h"
82#include "llvm/Support/Debug.h"
83#include "llvm/Support/ErrorHandling.h"
84#include "llvm/Support/GraphWriter.h"
85#include "llvm/Support/KnownBits.h"
86#include "llvm/Support/MathExtras.h"
87#include "llvm/Support/raw_ostream.h"
88#include "llvm/Transforms/Utils/LoopUtils.h"
89#include "llvm/Transforms/Vectorize.h"
90#include <algorithm>
91#include <cassert>
92#include <cstdint>
93#include <iterator>
94#include <memory>
95#include <set>
96#include <string>
97#include <tuple>
98#include <utility>
99#include <vector>
100
101using namespace llvm;
102using namespace llvm::PatternMatch;
103using namespace slpvectorizer;
104
105#define SV_NAME "slp-vectorizer"
106#define DEBUG_TYPE "SLP"
107
108STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
109
110cl::opt<bool>
111 llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden,
112 cl::desc("Run the SLP vectorization passes"));
113
114static cl::opt<int>
115 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
116 cl::desc("Only vectorize if you gain more than this "
117 "number "));
118
119static cl::opt<bool>
120ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
121 cl::desc("Attempt to vectorize horizontal reductions"));
122
123static cl::opt<bool> ShouldStartVectorizeHorAtStore(
124 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
125 cl::desc(
126 "Attempt to vectorize horizontal reductions feeding into a store"));
127
128static cl::opt<int>
129MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
130 cl::desc("Attempt to vectorize for this register size in bits"));
131
132static cl::opt<int>
133MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
134 cl::desc("Maximum depth of the lookup for consecutive stores."));
135
136/// Limits the size of scheduling regions in a block.
137/// It avoids long compile times for _very_ large blocks where vector
138/// instructions are spread over a wide range.
139/// This limit is way higher than needed by real-world functions.
140static cl::opt<int>
141ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
142 cl::desc("Limit the size of the SLP scheduling region per block"));
143
144static cl::opt<int> MinVectorRegSizeOption(
145 "slp-min-reg-size", cl::init(128), cl::Hidden,
146 cl::desc("Attempt to vectorize for this register size in bits"));
147
148static cl::opt<unsigned> RecursionMaxDepth(
149 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
150 cl::desc("Limit the recursion depth when building a vectorizable tree"));
151
152static cl::opt<unsigned> MinTreeSize(
153 "slp-min-tree-size", cl::init(3), cl::Hidden,
154 cl::desc("Only vectorize small trees if they are fully vectorizable"));
155
156// The maximum depth that the look-ahead score heuristic will explore.
157// The higher this value, the higher the compilation time overhead.
158static cl::opt<int> LookAheadMaxDepth(
159 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
160 cl::desc("The maximum look-ahead depth for operand reordering scores"));
161
162// The Look-ahead heuristic goes through the users of the bundle to calculate
163// the users cost in getExternalUsesCost(). To avoid compilation time increase
164// we limit the number of users visited to this value.
165static cl::opt<unsigned> LookAheadUsersBudget(
166 "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
167 cl::desc("The maximum number of users to visit while visiting the "
168 "predecessors. This prevents compilation time increase."));
169
170static cl::opt<bool>
171 ViewSLPTree("view-slp-tree", cl::Hidden,
172 cl::desc("Display the SLP trees with Graphviz"));
173
174// Limit the number of alias checks. The limit is chosen so that
175// it has no negative effect on the llvm benchmarks.
176static const unsigned AliasedCheckLimit = 10;
177
178// Another limit for the alias checks: The maximum distance between load/store
179// instructions where alias checks are done.
180// This limit is useful for very large basic blocks.
181static const unsigned MaxMemDepDistance = 160;
182
183/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
184/// regions to be handled.
185static const int MinScheduleRegionSize = 16;
186
187/// Predicate for the element types that the SLP vectorizer supports.
188///
189/// The most important thing to filter here are types which are invalid in LLVM
190/// vectors. We also filter target specific types which have absolutely no
191/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
192/// avoids spending time checking the cost model and realizing that they will
193/// be inevitably scalarized.
194static bool isValidElementType(Type *Ty) {
195 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
196 !Ty->isPPC_FP128Ty();
197}
198
199/// \returns true if all of the instructions in \p VL are in the same block or
200/// false otherwise.
201static bool allSameBlock(ArrayRef<Value *> VL) {
202 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
203 if (!I0)
204 return false;
205 BasicBlock *BB = I0->getParent();
206 for (int i = 1, e = VL.size(); i < e; i++) {
207 Instruction *I = dyn_cast<Instruction>(VL[i]);
208 if (!I)
209 return false;
210
211 if (BB != I->getParent())
212 return false;
213 }
214 return true;
215}
216
217/// \returns True if all of the values in \p VL are constants (but not
218/// globals/constant expressions).
219static bool allConstant(ArrayRef<Value *> VL) {
220 // Constant expressions and globals can't be vectorized like normal integer/FP
221 // constants.
222 for (Value *i : VL)
223 if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
224 return false;
225 return true;
226}
227
228/// \returns True if all of the values in \p VL are identical.
229static bool isSplat(ArrayRef<Value *> VL) {
230 for (unsigned i = 1, e = VL.size(); i < e; ++i)
231 if (VL[i] != VL[0])
232 return false;
233 return true;
234}
235
236/// \returns True if \p I is commutative, handles CmpInst as well as Instruction.
237static bool isCommutative(Instruction *I) {
238 if (auto *IC = dyn_cast<CmpInst>(I))
239 return IC->isCommutative();
240 return I->isCommutative();
241}
242
243/// Checks if the vector of instructions can be represented as a shuffle, like:
244/// %x0 = extractelement <4 x i8> %x, i32 0
245/// %x3 = extractelement <4 x i8> %x, i32 3
246/// %y1 = extractelement <4 x i8> %y, i32 1
247/// %y2 = extractelement <4 x i8> %y, i32 2
248/// %x0x0 = mul i8 %x0, %x0
249/// %x3x3 = mul i8 %x3, %x3
250/// %y1y1 = mul i8 %y1, %y1
251/// %y2y2 = mul i8 %y2, %y2
252/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
253/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
254/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
255/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
256/// ret <4 x i8> %ins4
257/// can be transformed into:
258/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
259/// i32 6>
260/// %2 = mul <4 x i8> %1, %1
261/// ret <4 x i8> %2
262/// We convert this initially to something like:
263/// %x0 = extractelement <4 x i8> %x, i32 0
264/// %x3 = extractelement <4 x i8> %x, i32 3
265/// %y1 = extractelement <4 x i8> %y, i32 1
266/// %y2 = extractelement <4 x i8> %y, i32 2
267/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
268/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
269/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
270/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
271/// %5 = mul <4 x i8> %4, %4
272/// %6 = extractelement <4 x i8> %5, i32 0
273/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
274/// %7 = extractelement <4 x i8> %5, i32 1
275/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
276/// %8 = extractelement <4 x i8> %5, i32 2
277/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
278/// %9 = extractelement <4 x i8> %5, i32 3
279/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
280/// ret <4 x i8> %ins4
281/// InstCombiner transforms this into a shuffle and vector mul
282/// TODO: Can we split off and reuse the shuffle mask detection from
283/// TargetTransformInfo::getInstructionThroughput?
284static Optional<TargetTransformInfo::ShuffleKind>
285isShuffle(ArrayRef<Value *> VL) {
286 auto *EI0 = cast<ExtractElementInst>(VL[0]);
287 unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
288 Value *Vec1 = nullptr;
289 Value *Vec2 = nullptr;
290 enum ShuffleMode { Unknown, Select, Permute };
291 ShuffleMode CommonShuffleMode = Unknown;
292 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
293 auto *EI = cast<ExtractElementInst>(VL[I]);
294 auto *Vec = EI->getVectorOperand();
295 // All vector operands must have the same number of vector elements.
296 if (Vec->getType()->getVectorNumElements() != Size)
297 return None;
298 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
299 if (!Idx)
300 return None;
301 // Undefined behavior if Idx is negative or >= Size.
302 if (Idx->getValue().uge(Size))
303 continue;
304 unsigned IntIdx = Idx->getValue().getZExtValue();
305 // We can extractelement from undef vector.
306 if (isa<UndefValue>(Vec))
307 continue;
308 // For correct shuffling we have to have at most 2 different vector operands
309 // in all extractelement instructions.
310 if (!Vec1 || Vec1 == Vec)
311 Vec1 = Vec;
312 else if (!Vec2 || Vec2 == Vec)
313 Vec2 = Vec;
314 else
315 return None;
316 if (CommonShuffleMode == Permute)
317 continue;
318 // If the extract index is not the same as the operation number, it is a
319 // permutation.
320 if (IntIdx != I) {
321 CommonShuffleMode = Permute;
322 continue;
323 }
324 CommonShuffleMode = Select;
325 }
326 // If we're not crossing lanes in different vectors, consider it as blending.
327 if (CommonShuffleMode == Select && Vec2)
328 return TargetTransformInfo::SK_Select;
329 // If Vec2 was never used, we have a permutation of a single vector, otherwise
330 // we have permutation of 2 vectors.
331 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
332 : TargetTransformInfo::SK_PermuteSingleSrc;
333}
334
335namespace {
336
337/// Main data required for vectorization of instructions.
338struct InstructionsState {
339 /// The very first instruction in the list with the main opcode.
340 Value *OpValue = nullptr;
341
342 /// The main/alternate instruction.
343 Instruction *MainOp = nullptr;
344 Instruction *AltOp = nullptr;
345
346 /// The main/alternate opcodes for the list of instructions.
347 unsigned getOpcode() const {
348 return MainOp ? MainOp->getOpcode() : 0;
349 }
350
351 unsigned getAltOpcode() const {
352 return AltOp ? AltOp->getOpcode() : 0;
353 }
354
355 /// Some of the instructions in the list have alternate opcodes.
356 bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
357
358 bool isOpcodeOrAlt(Instruction *I) const {
359 unsigned CheckedOpcode = I->getOpcode();
360 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
361 }
362
363 InstructionsState() = delete;
364 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
365 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
366};
367
368} // end anonymous namespace
369
370/// Chooses the correct key for scheduling data. If \p Op has the same (or
371/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
372/// OpValue.
373static Value *isOneOf(const InstructionsState &S, Value *Op) {
374 auto *I = dyn_cast<Instruction>(Op);
375 if (I && S.isOpcodeOrAlt(I))
376 return Op;
377 return S.OpValue;
378}
379
380/// \returns analysis of the Instructions in \p VL described in
381/// InstructionsState: the Opcode with which we suppose the whole list
382/// could be vectorized, even if its structure is diverse.
383static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
384 unsigned BaseIndex = 0) {
385 // Make sure these are all Instructions.
386 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
387 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
388
389 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
390 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
391 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
392 unsigned AltOpcode = Opcode;
393 unsigned AltIndex = BaseIndex;
394
395 // Check for one alternate opcode from another BinaryOperator.
396 // TODO - generalize to support all operators (types, calls etc.).
397 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
398 unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
399 if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
400 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
401 continue;
402 if (Opcode == AltOpcode) {
403 AltOpcode = InstOpcode;
404 AltIndex = Cnt;
405 continue;
406 }
407 } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
408 Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
409 Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
410 if (Ty0 == Ty1) {
411 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
412 continue;
413 if (Opcode == AltOpcode) {
414 AltOpcode = InstOpcode;
415 AltIndex = Cnt;
416 continue;
417 }
418 }
419 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
420 continue;
421 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
422 }
423
424 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
425 cast<Instruction>(VL[AltIndex]));
426}
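Note: getSameOpcode() above decides whether a bundle of scalars can be treated as a single (possibly alternating) opcode. The following standalone sketch mirrors only its main/alternate-opcode bookkeeping on plain enum values; the names (Opc, BundleState, analyzeBundle) are hypothetical, and the cast/type handling of the real routine is deliberately omitted.

#include <cassert>
#include <vector>

// Simplified stand-in for LLVM opcodes; illustrative only.
enum class Opc { Add, Sub, Mul, FDiv };

struct BundleState {
  bool Valid = false; // models the "nullptr MainOp" failure case
  Opc Main{}, Alt{};
  bool isAltShuffle() const { return Valid && Main != Alt; }
};

// Mirrors the core loop of getSameOpcode(): accept one main opcode plus at
// most one alternate opcode across the whole bundle, otherwise give up.
BundleState analyzeBundle(const std::vector<Opc> &VL) {
  BundleState S;
  if (VL.empty())
    return S;
  S.Valid = true;
  S.Main = S.Alt = VL.front();
  for (Opc O : VL) {
    if (O == S.Main || O == S.Alt)
      continue;
    if (S.Main == S.Alt) { // first time a second opcode appears
      S.Alt = O;
      continue;
    }
    return BundleState{}; // a third opcode: not one vectorizable bundle
  }
  return S;
}

int main() {
  // {add, sub, add, sub} forms a valid alternating bundle ...
  assert(analyzeBundle({Opc::Add, Opc::Sub, Opc::Add, Opc::Sub}).isAltShuffle());
  // ... while a third distinct opcode makes the bundle invalid.
  assert(!analyzeBundle({Opc::Add, Opc::Sub, Opc::Mul, Opc::Sub}).Valid);
  return 0;
}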
427
428/// \returns true if all of the values in \p VL have the same type or false
429/// otherwise.
430static bool allSameType(ArrayRef<Value *> VL) {
431 Type *Ty = VL[0]->getType();
432 for (int i = 1, e = VL.size(); i < e; i++)
433 if (VL[i]->getType() != Ty)
434 return false;
435
436 return true;
437}
438
439/// \returns True if Extract{Value,Element} instruction extracts element Idx.
440static Optional<unsigned> getExtractIndex(Instruction *E) {
441 unsigned Opcode = E->getOpcode();
442 assert((Opcode == Instruction::ExtractElement ||
443         Opcode == Instruction::ExtractValue) &&
444        "Expected extractelement or extractvalue instruction.");
445 if (Opcode == Instruction::ExtractElement) {
446 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
447 if (!CI)
448 return None;
449 return CI->getZExtValue();
450 }
451 ExtractValueInst *EI = cast<ExtractValueInst>(E);
452 if (EI->getNumIndices() != 1)
453 return None;
454 return *EI->idx_begin();
455}
456
457/// \returns True if in-tree use also needs extract. This refers to
458/// possible scalar operand in vectorized instruction.
459static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
460 TargetLibraryInfo *TLI) {
461 unsigned Opcode = UserInst->getOpcode();
462 switch (Opcode) {
463 case Instruction::Load: {
464 LoadInst *LI = cast<LoadInst>(UserInst);
465 return (LI->getPointerOperand() == Scalar);
466 }
467 case Instruction::Store: {
468 StoreInst *SI = cast<StoreInst>(UserInst);
469 return (SI->getPointerOperand() == Scalar);
470 }
471 case Instruction::Call: {
472 CallInst *CI = cast<CallInst>(UserInst);
473 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
474 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
475 if (hasVectorInstrinsicScalarOpd(ID, i))
476 return (CI->getArgOperand(i) == Scalar);
477 }
478 LLVM_FALLTHROUGH;
479 }
480 default:
481 return false;
482 }
483}
484
485/// \returns the AA location that is being accessed by the instruction.
486static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
487 if (StoreInst *SI = dyn_cast<StoreInst>(I))
488 return MemoryLocation::get(SI);
489 if (LoadInst *LI = dyn_cast<LoadInst>(I))
490 return MemoryLocation::get(LI);
491 return MemoryLocation();
492}
493
494/// \returns True if the instruction is not a volatile or atomic load/store.
495static bool isSimple(Instruction *I) {
496 if (LoadInst *LI = dyn_cast<LoadInst>(I))
497 return LI->isSimple();
498 if (StoreInst *SI = dyn_cast<StoreInst>(I))
499 return SI->isSimple();
500 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
501 return !MI->isVolatile();
502 return true;
503}
504
505namespace llvm {
506
507namespace slpvectorizer {
508
509/// Bottom Up SLP Vectorizer.
510class BoUpSLP {
511 struct TreeEntry;
512 struct ScheduleData;
513
514public:
515 using ValueList = SmallVector<Value *, 8>;
516 using InstrList = SmallVector<Instruction *, 16>;
517 using ValueSet = SmallPtrSet<Value *, 16>;
518 using StoreList = SmallVector<StoreInst *, 8>;
519 using ExtraValueToDebugLocsMap =
520 MapVector<Value *, SmallVector<Instruction *, 2>>;
521
522 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
523 TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
524 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
525 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
526 : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
527 DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
528 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
529 // Use the vector register size specified by the target unless overridden
530 // by a command-line option.
531 // TODO: It would be better to limit the vectorization factor based on
532 // data type rather than just register size. For example, x86 AVX has
533 // 256-bit registers, but it does not support integer operations
534 // at that width (that requires AVX2).
535 if (MaxVectorRegSizeOption.getNumOccurrences())
536 MaxVecRegSize = MaxVectorRegSizeOption;
537 else
538 MaxVecRegSize = TTI->getRegisterBitWidth(true);
539
540 if (MinVectorRegSizeOption.getNumOccurrences())
541 MinVecRegSize = MinVectorRegSizeOption;
542 else
543 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
544 }
545
546 /// Vectorize the tree that starts with the elements in \p VL.
547 /// Returns the vectorized root.
548 Value *vectorizeTree();
549
550 /// Vectorize the tree but with the list of externally used values \p
551 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
552 /// generated extractvalue instructions.
553 Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
554
555 /// \returns the cost incurred by unwanted spills and fills, caused by
556 /// holding live values over call sites.
557 int getSpillCost() const;
558
559 /// \returns the vectorization cost of the subtree that starts at \p VL.
560 /// A negative number means that this is profitable.
561 int getTreeCost();
562
563 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
564 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
565 void buildTree(ArrayRef<Value *> Roots,
566 ArrayRef<Value *> UserIgnoreLst = None);
567
568 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
569 /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
570 /// into account (and updating it, if required) list of externally used
571 /// values stored in \p ExternallyUsedValues.
572 void buildTree(ArrayRef<Value *> Roots,
573 ExtraValueToDebugLocsMap &ExternallyUsedValues,
574 ArrayRef<Value *> UserIgnoreLst = None);
575
576 /// Clear the internal data structures that are created by 'buildTree'.
577 void deleteTree() {
578 VectorizableTree.clear();
579 ScalarToTreeEntry.clear();
580 MustGather.clear();
581 ExternalUses.clear();
582 NumOpsWantToKeepOrder.clear();
583 NumOpsWantToKeepOriginalOrder = 0;
584 for (auto &Iter : BlocksSchedules) {
585 BlockScheduling *BS = Iter.second.get();
586 BS->clear();
587 }
588 MinBWs.clear();
589 }
590
591 unsigned getTreeSize() const { return VectorizableTree.size(); }
592
593 /// Perform LICM and CSE on the newly generated gather sequences.
594 void optimizeGatherSequence();
595
596 /// \returns The best order of instructions for vectorization.
597 Optional<ArrayRef<unsigned>> bestOrder() const {
598 auto I = std::max_element(
599 NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
600 [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
601 const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
602 return D1.second < D2.second;
603 });
604 if (I == NumOpsWantToKeepOrder.end() ||
605 I->getSecond() <= NumOpsWantToKeepOriginalOrder)
606 return None;
607
608 return makeArrayRef(I->getFirst());
609 }
610
611 /// \return The vector element size in bits to use when vectorizing the
612 /// expression tree ending at \p V. If V is a store, the size is the width of
613 /// the stored value. Otherwise, the size is the width of the largest loaded
614 /// value reaching V. This method is used by the vectorizer to calculate
615 /// vectorization factors.
616 unsigned getVectorElementSize(Value *V) const;
617
618 /// Compute the minimum type sizes required to represent the entries in a
619 /// vectorizable tree.
620 void computeMinimumValueSizes();
621
622 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
623 unsigned getMaxVecRegSize() const {
624 return MaxVecRegSize;
625 }
626
627 // \returns minimum vector register size as set by cl::opt.
628 unsigned getMinVecRegSize() const {
629 return MinVecRegSize;
630 }
631
632 /// Check if homogeneous aggregate is isomorphic to some VectorType.
633 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
634 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
635 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
636 ///
637 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
638 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
639
640 /// \returns True if the VectorizableTree is both tiny and not fully
641 /// vectorizable. We do not vectorize such trees.
642 bool isTreeTinyAndNotFullyVectorizable() const;
643
644 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
645 /// can be load combined in the backend. Load combining may not be allowed in
646 /// the IR optimizer, so we do not want to alter the pattern. For example,
647 /// partially transforming a scalar bswap() pattern into vector code is
648 /// effectively impossible for the backend to undo.
649 /// TODO: If load combining is allowed in the IR optimizer, this analysis
650 /// may not be necessary.
651 bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
652
653 OptimizationRemarkEmitter *getORE() { return ORE; }
654
655 /// This structure holds any data we need about the edges being traversed
656 /// during buildTree_rec(). We keep track of:
657 /// (i) the user TreeEntry index, and
658 /// (ii) the index of the edge.
659 struct EdgeInfo {
660 EdgeInfo() = default;
661 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
662 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
663 /// The user TreeEntry.
664 TreeEntry *UserTE = nullptr;
665 /// The operand index of the use.
666 unsigned EdgeIdx = UINT_MAX;
667#ifndef NDEBUG
668 friend inline raw_ostream &operator<<(raw_ostream &OS,
669 const BoUpSLP::EdgeInfo &EI) {
670 EI.dump(OS);
671 return OS;
672 }
673 /// Debug print.
674 void dump(raw_ostream &OS) const {
675 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
676 << " EdgeIdx:" << EdgeIdx << "}";
677 }
678 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
679#endif
680 };
681
682 /// A helper data structure to hold the operands of a vector of instructions.
683 /// This supports a fixed vector length for all operand vectors.
684 class VLOperands {
685 /// For each operand we need (i) the value, and (ii) the opcode that it
686 /// would be attached to if the expression was in a left-linearized form.
687 /// This is required to avoid illegal operand reordering.
688 /// For example:
689 /// \verbatim
690 /// 0 Op1
691 /// |/
692 /// Op1 Op2 Linearized + Op2
693 /// \ / ----------> |/
694 /// - -
695 ///
696 /// Op1 - Op2 (0 + Op1) - Op2
697 /// \endverbatim
698 ///
699 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
700 ///
701 /// Another way to think of this is to track all the operations across the
702 /// path from the operand all the way to the root of the tree and to
703 /// calculate the operation that corresponds to this path. For example, the
704 /// path from Op2 to the root crosses the RHS of the '-', therefore the
705 /// corresponding operation is a '-' (which matches the one in the
706 /// linearized tree, as shown above).
707 ///
708 /// For lack of a better term, we refer to this operation as Accumulated
709 /// Path Operation (APO).
710 struct OperandData {
711 OperandData() = default;
712 OperandData(Value *V, bool APO, bool IsUsed)
713 : V(V), APO(APO), IsUsed(IsUsed) {}
714 /// The operand value.
715 Value *V = nullptr;
716 /// TreeEntries only allow a single opcode, or an alternate sequence of
717 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
718 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
719 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
720 /// (e.g., Add/Mul)
721 bool APO = false;
722 /// Helper data for the reordering function.
723 bool IsUsed = false;
724 };
725
726 /// During operand reordering, we are trying to select the operand at lane
727 /// that matches best with the operand at the neighboring lane. Our
728 /// selection is based on the type of value we are looking for. For example,
729 /// if the neighboring lane has a load, we need to look for a load that is
730 /// accessing a consecutive address. These strategies are summarized in the
731 /// 'ReorderingMode' enumerator.
732 enum class ReorderingMode {
733 Load, ///< Matching loads to consecutive memory addresses
734 Opcode, ///< Matching instructions based on opcode (same or alternate)
735 Constant, ///< Matching constants
736 Splat, ///< Matching the same instruction multiple times (broadcast)
737 Failed, ///< We failed to create a vectorizable group
738 };
739
740 using OperandDataVec = SmallVector<OperandData, 2>;
741
742 /// A vector of operand vectors.
743 SmallVector<OperandDataVec, 4> OpsVec;
744
745 const DataLayout &DL;
746 ScalarEvolution &SE;
747 const BoUpSLP &R;
748
749 /// \returns the operand data at \p OpIdx and \p Lane.
750 OperandData &getData(unsigned OpIdx, unsigned Lane) {
751 return OpsVec[OpIdx][Lane];
752 }
753
754 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
755 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
756 return OpsVec[OpIdx][Lane];
757 }
758
759 /// Clears the used flag for all entries.
760 void clearUsed() {
761 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
762 OpIdx != NumOperands; ++OpIdx)
763 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
764 ++Lane)
765 OpsVec[OpIdx][Lane].IsUsed = false;
766 }
767
768 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
769 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
770 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
771 }
772
773 // The hard-coded scores listed here are not very important. When computing
774 // the scores of matching one sub-tree with another, we are basically
775 // counting the number of values that are matching. So even if all scores
776 // are set to 1, we would still get a decent matching result.
777 // However, sometimes we have to break ties. For example we may have to
778 // choose between matching loads vs matching opcodes. This is what these
779 // scores are helping us with: they provide the order of preference.
780
781 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
782 static const int ScoreConsecutiveLoads = 3;
783 /// ExtractElementInst from same vector and consecutive indexes.
784 static const int ScoreConsecutiveExtracts = 3;
785 /// Constants.
786 static const int ScoreConstants = 2;
787 /// Instructions with the same opcode.
788 static const int ScoreSameOpcode = 2;
789 /// Instructions with alt opcodes (e.g, add + sub).
790 static const int ScoreAltOpcodes = 1;
791 /// Identical instructions (a.k.a. splat or broadcast).
792 static const int ScoreSplat = 1;
793 /// Matching with an undef is preferable to failing.
794 static const int ScoreUndef = 1;
795 /// Score for failing to find a decent match.
796 static const int ScoreFail = 0;
797 /// User external to the vectorized code.
798 static const int ExternalUseCost = 1;
799 /// The user is internal but in a different lane.
800 static const int UserInDiffLaneCost = ExternalUseCost;
801
802 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
803 static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
804 ScalarEvolution &SE) {
805 auto *LI1 = dyn_cast<LoadInst>(V1);
806 auto *LI2 = dyn_cast<LoadInst>(V2);
807 if (LI1 && LI2)
808 return isConsecutiveAccess(LI1, LI2, DL, SE)
809 ? VLOperands::ScoreConsecutiveLoads
810 : VLOperands::ScoreFail;
811
812 auto *C1 = dyn_cast<Constant>(V1);
813 auto *C2 = dyn_cast<Constant>(V2);
814 if (C1 && C2)
815 return VLOperands::ScoreConstants;
816
817 // Extracts from consecutive indexes of the same vector better score as
818 // the extracts could be optimized away.
819 auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
820 auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
821 if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
822 cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
823 cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
824 return VLOperands::ScoreConsecutiveExtracts;
825 }
826
827 auto *I1 = dyn_cast<Instruction>(V1);
828 auto *I2 = dyn_cast<Instruction>(V2);
829 if (I1 && I2) {
830 if (I1 == I2)
831 return VLOperands::ScoreSplat;
832 InstructionsState S = getSameOpcode({I1, I2});
833 // Note: Only consider instructions with <= 2 operands to avoid
834 // complexity explosion.
835 if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
836 return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
837 : VLOperands::ScoreSameOpcode;
838 }
839
840 if (isa<UndefValue>(V2))
841 return VLOperands::ScoreUndef;
842
843 return VLOperands::ScoreFail;
844 }
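Note: the Score* constants and getShallowScore() above only need to impose a preference order (consecutive loads/extracts beat constants and same-opcode matches, which beat alternate opcodes, splats and undefs, which beat failure). The standalone sketch below shows how such a tie-break picks among candidate operands; the Candidate enum and shallowScore() are hypothetical simplifications, not the SLP cost model itself.

#include <algorithm>
#include <cassert>
#include <vector>

// Simplified analogue of the preference order encoded by the constants above.
enum Candidate { ConsecutiveLoad, SameOpcode, AltOpcode, Splat, Fail };

int shallowScore(Candidate C) {
  switch (C) {
  case ConsecutiveLoad: return 3; // ScoreConsecutiveLoads
  case SameOpcode:      return 2; // ScoreSameOpcode
  case AltOpcode:       return 1; // ScoreAltOpcodes
  case Splat:           return 1; // ScoreSplat
  case Fail:            return 0; // ScoreFail
  }
  return 0;
}

int main() {
  // Given several candidates for the next lane, the reordering heuristic
  // keeps the one with the highest shallow score.
  std::vector<Candidate> Candidates = {SameOpcode, ConsecutiveLoad, Fail};
  Candidate Best = *std::max_element(
      Candidates.begin(), Candidates.end(),
      [](Candidate A, Candidate B) { return shallowScore(A) < shallowScore(B); });
  assert(Best == ConsecutiveLoad);
  return 0;
}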
845
846 /// Holds the values and their lane that are taking part in the look-ahead
847 /// score calculation. This is used in the external uses cost calculation.
848 SmallDenseMap<Value *, int> InLookAheadValues;
849
850 /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
851 /// either external to the vectorized code, or require shuffling.
852 int getExternalUsesCost(const std::pair<Value *, int> &LHS,
853 const std::pair<Value *, int> &RHS) {
854 int Cost = 0;
855 SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
856 for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
857 Value *V = Values[Idx].first;
858 // Calculate the absolute lane, using the minimum relative lane of LHS
859 // and RHS as base and Idx as the offset.
860 int Ln = std::min(LHS.second, RHS.second) + Idx;
861 assert(Ln >= 0 && "Bad lane calculation");
862 unsigned UsersBudget = LookAheadUsersBudget;
863 for (User *U : V->users()) {
864 if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
865 // The user is in the VectorizableTree. Check if we need to insert.
866 auto It = llvm::find(UserTE->Scalars, U);
867 assert(It != UserTE->Scalars.end() && "U is in UserTE");
868 int UserLn = std::distance(UserTE->Scalars.begin(), It);
869 assert(UserLn >= 0 && "Bad lane");
870 if (UserLn != Ln)
871 Cost += UserInDiffLaneCost;
872 } else {
873 // Check if the user is in the look-ahead code.
874 auto It2 = InLookAheadValues.find(U);
875 if (It2 != InLookAheadValues.end()) {
876 // The user is in the look-ahead code. Check the lane.
877 if (It2->second != Ln)
878 Cost += UserInDiffLaneCost;
879 } else {
880 // The user is neither in SLP tree nor in the look-ahead code.
881 Cost += ExternalUseCost;
882 }
883 }
884 // Limit the number of visited uses to cap compilation time.
885 if (--UsersBudget == 0)
886 break;
887 }
888 }
889 return Cost;
890 }
891
892 /// Go through the operands of \p LHS and \p RHS recursively until \p
893 /// MaxLevel, and return the cumulative score. For example:
894 /// \verbatim
895 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
896 /// \ / \ / \ / \ /
897 /// + + + +
898 /// G1 G2 G3 G4
899 /// \endverbatim
900 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
901 /// each level recursively, accumulating the score. It starts from matching
902 /// the additions at level 0, then moves on to the loads (level 1). The
903 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
904 /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
905 /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
906 /// Please note that the order of the operands does not matter, as we
907 /// evaluate the score of all profitable combinations of operands. In
908 /// other words the score of G1 and G4 is the same as G1 and G2. This
909 /// heuristic is based on ideas described in:
910 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
911 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
912 /// Luís F. W. Góes
913 int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
914 const std::pair<Value *, int> &RHS, int CurrLevel,
915 int MaxLevel) {
916
917 Value *V1 = LHS.first;
918 Value *V2 = RHS.first;
919 // Get the shallow score of V1 and V2.
920 int ShallowScoreAtThisLevel =
921 std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
922 getExternalUsesCost(LHS, RHS));
923 int Lane1 = LHS.second;
924 int Lane2 = RHS.second;
925
926 // If reached MaxLevel,
927 // or if V1 and V2 are not instructions,
928 // or if they are SPLAT,
929 // or if they are not consecutive, early return the current cost.
930 auto *I1 = dyn_cast<Instruction>(V1);
931 auto *I2 = dyn_cast<Instruction>(V2);
932 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
933 ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
934 (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
935 return ShallowScoreAtThisLevel;
936 assert(I1 && I2 && "Should have early exited.");
937
938 // Keep track of in-tree values for determining the external-use cost.
939 InLookAheadValues[V1] = Lane1;
940 InLookAheadValues[V2] = Lane2;
941
942 // Contains the I2 operand indexes that got matched with I1 operands.
943 SmallSet<unsigned, 4> Op2Used;
944
945 // Recursion towards the operands of I1 and I2. We are trying all possible
946 // operand pairs, and keeping track of the best score.
947 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
948 OpIdx1 != NumOperands1; ++OpIdx1) {
949 // Try to pair op1I with the best operand of I2.
950 int MaxTmpScore = 0;
951 unsigned MaxOpIdx2 = 0;
952 bool FoundBest = false;
953 // If I2 is commutative try all combinations.
954 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
955 unsigned ToIdx = isCommutative(I2)
956 ? I2->getNumOperands()
957 : std::min(I2->getNumOperands(), OpIdx1 + 1);
958 assert(FromIdx <= ToIdx && "Bad index");
959 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
960 // Skip operands already paired with OpIdx1.
961 if (Op2Used.count(OpIdx2))
962 continue;
963 // Recursively calculate the cost at each level
964 int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
965 {I2->getOperand(OpIdx2), Lane2},
966 CurrLevel + 1, MaxLevel);
967 // Look for the best score.
968 if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
969 MaxTmpScore = TmpScore;
970 MaxOpIdx2 = OpIdx2;
971 FoundBest = true;
972 }
973 }
974 if (FoundBest) {
975 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
976 Op2Used.insert(MaxOpIdx2);
977 ShallowScoreAtThisLevel += MaxTmpScore;
978 }
979 }
980 return ShallowScoreAtThisLevel;
981 }
982
983 /// \Returns the look-ahead score, which tells us how much the sub-trees
984 /// rooted at \p LHS and \p RHS match, the more they match the higher the
985 /// score. This helps break ties in an informed way when we cannot decide on
986 /// the order of the operands by just considering the immediate
987 /// predecessors.
988 int getLookAheadScore(const std::pair<Value *, int> &LHS,
989 const std::pair<Value *, int> &RHS) {
990 InLookAheadValues.clear();
991 return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
992 }
993
994 // Search all operands in Ops[*][Lane] for the one that matches best
995 // Ops[OpIdx][LastLane] and return its operand index.
996 // If no good match can be found, return None.
997 Optional<unsigned>
998 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
999 ArrayRef<ReorderingMode> ReorderingModes) {
1000 unsigned NumOperands = getNumOperands();
1001
1002 // The operand of the previous lane at OpIdx.
1003 Value *OpLastLane = getData(OpIdx, LastLane).V;
1004
1005 // Our strategy mode for OpIdx.
1006 ReorderingMode RMode = ReorderingModes[OpIdx];
1007
1008 // The linearized opcode of the operand at OpIdx, Lane.
1009 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1010
1011 // The best operand index and its score.
1012 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1013 // are using the score to differentiate between the two.
1014 struct BestOpData {
1015 Optional<unsigned> Idx = None;
1016 unsigned Score = 0;
1017 } BestOp;
1018
1019 // Iterate through all unused operands and look for the best.
1020 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1021 // Get the operand at Idx and Lane.
1022 OperandData &OpData = getData(Idx, Lane);
1023 Value *Op = OpData.V;
1024 bool OpAPO = OpData.APO;
1025
1026 // Skip already selected operands.
1027 if (OpData.IsUsed)
1028 continue;
1029
1030 // Skip if we are trying to move the operand to a position with a
1031 // different opcode in the linearized tree form. This would break the
1032 // semantics.
1033 if (OpAPO != OpIdxAPO)
1034 continue;
1035
1036 // Look for an operand that matches the current mode.
1037 switch (RMode) {
1038 case ReorderingMode::Load:
1039 case ReorderingMode::Constant:
1040 case ReorderingMode::Opcode: {
1041 bool LeftToRight = Lane > LastLane;
1042 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1043 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1044 unsigned Score =
1045 getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
1046 if (Score > BestOp.Score) {
1047 BestOp.Idx = Idx;
1048 BestOp.Score = Score;
1049 }
1050 break;
1051 }
1052 case ReorderingMode::Splat:
1053 if (Op == OpLastLane)
1054 BestOp.Idx = Idx;
1055 break;
1056 case ReorderingMode::Failed:
1057 return None;
1058 }
1059 }
1060
1061 if (BestOp.Idx) {
1062 getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
1063 return BestOp.Idx;
1064 }
1065 // If we could not find a good match return None.
1066 return None;
1067 }
1068
1069 /// Helper for reorderOperandVecs. \Returns the lane that we should start
1070 /// reordering from. This is the one which has the least number of operands
1071 /// that can freely move about.
1072 unsigned getBestLaneToStartReordering() const {
1073 unsigned BestLane = 0;
1074 unsigned Min = UINT_MAX;
1075 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1076 ++Lane) {
1077 unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
1078 if (NumFreeOps < Min) {
1079 Min = NumFreeOps;
1080 BestLane = Lane;
1081 }
1082 }
1083 return BestLane;
1084 }
1085
1086 /// \Returns the maximum number of operands that are allowed to be reordered
1087 /// for \p Lane. This is used as a heuristic for selecting the first lane to
1088 /// start operand reordering.
1089 unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1090 unsigned CntTrue = 0;
1091 unsigned NumOperands = getNumOperands();
1092 // Operands with the same APO can be reordered. We therefore need to count
1093 // how many of them we have for each APO, like this: Cnt[APO] = x.
1094 // Since we only have two APOs, namely true and false, we can avoid using
1095 // a map. Instead we can simply count the number of operands that
1096 // correspond to one of them (in this case the 'true' APO), and calculate
1097 // the other by subtracting it from the total number of operands.
1098 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
1099 if (getData(OpIdx, Lane).APO)
1100 ++CntTrue;
1101 unsigned CntFalse = NumOperands - CntTrue;
1102 return std::max(CntTrue, CntFalse);
1103 }
1104
1105 /// Go through the instructions in VL and append their operands.
1106 void appendOperandsOfVL(ArrayRef<Value *> VL) {
1107 assert(!VL.empty() && "Bad VL");
1108 assert((empty() || VL.size() == getNumLanes()) &&
1109        "Expected same number of lanes");
1110 assert(isa<Instruction>(VL[0]) && "Expected instruction");
1111 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1112 OpsVec.resize(NumOperands);
1113 unsigned NumLanes = VL.size();
1114 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1115 OpsVec[OpIdx].resize(NumLanes);
1116 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1117 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1118 // Our tree has just 3 nodes: the root and two operands.
1119 // It is therefore trivial to get the APO. We only need to check the
1120 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1121 // RHS operand. The LHS operand of both add and sub is never attached
1122 // to an inverse operation in the linearized form, therefore its APO
1123 // is false. The RHS is true only if VL[Lane] is an inverse operation.
1124
1125 // Since operand reordering is performed on groups of commutative
1126 // operations or alternating sequences (e.g., +, -), we can safely
1127 // tell the inverse operations by checking commutativity.
1128 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1129 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1130 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1131 APO, false};
1132 }
1133 }
1134 }
1135
1136 /// \returns the number of operands.
1137 unsigned getNumOperands() const { return OpsVec.size(); }
1138
1139 /// \returns the number of lanes.
1140 unsigned getNumLanes() const { return OpsVec[0].size(); }
1141
1142 /// \returns the operand value at \p OpIdx and \p Lane.
1143 Value *getValue(unsigned OpIdx, unsigned Lane) const {
1144 return getData(OpIdx, Lane).V;
1145 }
1146
1147 /// \returns true if the data structure is empty.
1148 bool empty() const { return OpsVec.empty(); }
1149
1150 /// Clears the data.
1151 void clear() { OpsVec.clear(); }
1152
1153 /// \Returns true if there are enough operands identical to \p Op to fill
1154 /// the whole vector.
1155 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
1156 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1157 bool OpAPO = getData(OpIdx, Lane).APO;
1158 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1159 if (Ln == Lane)
1160 continue;
1161 // This is set to true if we found a candidate for broadcast at Lane.
1162 bool FoundCandidate = false;
1163 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1164 OperandData &Data = getData(OpI, Ln);
1165 if (Data.APO != OpAPO || Data.IsUsed)
1166 continue;
1167 if (Data.V == Op) {
1168 FoundCandidate = true;
1169 Data.IsUsed = true;
1170 break;
1171 }
1172 }
1173 if (!FoundCandidate)
1174 return false;
1175 }
1176 return true;
1177 }
1178
1179 public:
1180 /// Initialize with all the operands of the instruction vector \p RootVL.
1181 VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
1182 ScalarEvolution &SE, const BoUpSLP &R)
1183 : DL(DL), SE(SE), R(R) {
1184 // Append all the operands of RootVL.
1185 appendOperandsOfVL(RootVL);
1186 }
1187
1188 /// \Returns a value vector with the operands across all lanes for the
1189 /// operand at \p OpIdx.
1190 ValueList getVL(unsigned OpIdx) const {
1191 ValueList OpVL(OpsVec[OpIdx].size());
1192 assert(OpsVec[OpIdx].size() == getNumLanes() &&
1193        "Expected same num of lanes across all operands");
1194 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
1195 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
1196 return OpVL;
1197 }
1198
1199 // Performs operand reordering for 2 or more operands.
1200 // The original operands are in OrigOps[OpIdx][Lane].
1201 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
1202 void reorder() {
1203 unsigned NumOperands = getNumOperands();
1204 unsigned NumLanes = getNumLanes();
1205 // Each operand has its own mode. We are using this mode to help us select
1206 // the instructions for each lane, so that they match best with the ones
1207 // we have selected so far.
1208 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
1209
1210 // This is a greedy single-pass algorithm. We are going over each lane
1211 // once and deciding on the best order right away with no back-tracking.
1212 // However, in order to increase its effectiveness, we start with the lane
1213 // that has operands that can move the least. For example, given the
1214 // following lanes:
1215 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
1216 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
1217 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
1218 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
1219 // we will start at Lane 1, since the operands of the subtraction cannot
1220 // be reordered. Then we will visit the rest of the lanes in a circular
1221 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
1222
1223 // Find the first lane that we will start our search from.
1224 unsigned FirstLane = getBestLaneToStartReordering();
1225
1226 // Initialize the modes.
1227 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1228 Value *OpLane0 = getValue(OpIdx, FirstLane);
1229 // Keep track if we have instructions with all the same opcode on one
1230 // side.
1231 if (isa<LoadInst>(OpLane0))
1232 ReorderingModes[OpIdx] = ReorderingMode::Load;
1233 else if (isa<Instruction>(OpLane0)) {
1234 // Check if OpLane0 should be broadcast.
1235 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
1236 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1237 else
1238 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
1239 }
1240 else if (isa<Constant>(OpLane0))
1241 ReorderingModes[OpIdx] = ReorderingMode::Constant;
1242 else if (isa<Argument>(OpLane0))
1243 // Our best hope is a Splat. It may save some cost in some cases.
1244 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1245 else
1246 // NOTE: This should be unreachable.
1247 ReorderingModes[OpIdx] = ReorderingMode::Failed;
1248 }
1249
1250 // If the initial strategy fails for any of the operand indexes, then we
1251 // perform reordering again in a second pass. This helps avoid assigning
1252 // high priority to the failed strategy, and should improve reordering for
1253 // the non-failed operand indexes.
1254 for (int Pass = 0; Pass != 2; ++Pass) {
1255 // Skip the second pass if the first pass did not fail.
1256 bool StrategyFailed = false;
1257 // Mark all operand data as free to use.
1258 clearUsed();
1259 // We keep the original operand order for the FirstLane, so reorder the
1260 // rest of the lanes. We are visiting the nodes in a circular fashion,
1261 // using FirstLane as the center point and increasing the radius
1262 // distance.
1263 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
1264 // Visit the lane on the right and then the lane on the left.
1265 for (int Direction : {+1, -1}) {
1266 int Lane = FirstLane + Direction * Distance;
1267 if (Lane < 0 || Lane >= (int)NumLanes)
1268 continue;
1269 int LastLane = Lane - Direction;
1270 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
1271 "Out of bounds");
1272 // Look for a good match for each operand.
1273 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1274 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
1275 Optional<unsigned> BestIdx =
1276 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
1277 // By not selecting a value, we allow the operands that follow to
1278 // select a better matching value. We will get a non-null value in
1279 // the next run of getBestOperand().
1280 if (BestIdx) {
1281 // Swap the current operand with the one returned by
1282 // getBestOperand().
1283 swap(OpIdx, BestIdx.getValue(), Lane);
1284 } else {
1285 // We failed to find a best operand, set mode to 'Failed'.
1286 ReorderingModes[OpIdx] = ReorderingMode::Failed;
1287 // Enable the second pass.
1288 StrategyFailed = true;
1289 }
1290 }
1291 }
1292 }
1293 // Skip second pass if the strategy did not fail.
1294 if (!StrategyFailed)
1295 break;
1296 }
1297 }
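
The circular visitation order described in the comment can be illustrated with a small self-contained sketch (plain C++, illustrative names only):

#include <cstdio>
#include <initializer_list>

// Prints the lanes in the order reorder() visits them: the anchor lane first,
// then lanes at increasing distance, right side before left.
void printVisitOrder(int FirstLane, int NumLanes) {
  std::printf("%d", FirstLane);
  for (int Distance = 1; Distance != NumLanes; ++Distance)
    for (int Direction : {+1, -1}) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= NumLanes)
        continue;
      std::printf(" %d", Lane);
    }
  std::printf("\n");
}

// printVisitOrder(1, 4) prints "1 2 0 3", matching the example in the comment
// above: Lane 1 first, then Lane 2, Lane 0, and finally Lane 3.
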
1298
1299#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1300 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
1301 switch (RMode) {
1302 case ReorderingMode::Load:
1303 return "Load";
1304 case ReorderingMode::Opcode:
1305 return "Opcode";
1306 case ReorderingMode::Constant:
1307 return "Constant";
1308 case ReorderingMode::Splat:
1309 return "Splat";
1310 case ReorderingMode::Failed:
1311 return "Failed";
1312 }
1313 llvm_unreachable("Unimplemented Reordering Type");
1314 }
1315
1316 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
1317 raw_ostream &OS) {
1318 return OS << getModeStr(RMode);
1319 }
1320
1321 /// Debug print.
1322 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
1323 printMode(RMode, dbgs());
1324 }
1325
1326 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
1327 return printMode(RMode, OS);
1328 }
1329
1330 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
1331 const unsigned Indent = 2;
1332 unsigned Cnt = 0;
1333 for (const OperandDataVec &OpDataVec : OpsVec) {
1334 OS << "Operand " << Cnt++ << "\n";
1335 for (const OperandData &OpData : OpDataVec) {
1336 OS.indent(Indent) << "{";
1337 if (Value *V = OpData.V)
1338 OS << *V;
1339 else
1340 OS << "null";
1341 OS << ", APO:" << OpData.APO << "}\n";
1342 }
1343 OS << "\n";
1344 }
1345 return OS;
1346 }
1347
1348 /// Debug print.
1349 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
1350#endif
1351 };
1352
1353 /// Checks if the instruction is marked for deletion.
1354 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
1355
1356 /// Marks the values' operands for later deletion by replacing them with Undefs.
1357 void eraseInstructions(ArrayRef<Value *> AV);
1358
1359 ~BoUpSLP();
1360
1361private:
1362 /// Checks if all users of \p I are the part of the vectorization tree.
1363 bool areAllUsersVectorized(Instruction *I) const;
1364
1365 /// \returns the cost of the vectorizable entry.
1366 int getEntryCost(TreeEntry *E);
1367
1368 /// This is the recursive part of buildTree.
1369 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
1370 const EdgeInfo &EI);
1371
1372 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
1373 /// be vectorized to use the original vector (or aggregate "bitcast" to a
1374 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
1375 /// returns false, setting \p CurrentOrder to either an empty vector or a
1376 /// non-identity permutation that allows reusing extract instructions.
1377 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
1378 SmallVectorImpl<unsigned> &CurrentOrder) const;
1379
1380 /// Vectorize a single entry in the tree.
1381 Value *vectorizeTree(TreeEntry *E);
1382
1383 /// Vectorize a single entry in the tree, starting in \p VL.
1384 Value *vectorizeTree(ArrayRef<Value *> VL);
1385
1386 /// \returns the scalarization cost for this type. Scalarization in this
1387 /// context means the creation of vectors from a group of scalars.
1388 int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
1389
1390 /// \returns the scalarization cost for this list of values. Assuming that
1391 /// this subtree gets vectorized, we may need to extract the values from the
1392 /// roots. This method calculates the cost of extracting the values.
1393 int getGatherCost(ArrayRef<Value *> VL) const;
1394
1395 /// Set the Builder insert point to one after the last instruction in
1396 /// the bundle
1397 void setInsertPointAfterBundle(TreeEntry *E);
1398
1399 /// \returns a vector from a collection of scalars in \p VL.
1400 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
1401
1402 /// \returns whether the VectorizableTree is fully vectorizable and will
1403 /// be beneficial even if the tree height is tiny.
1404 bool isFullyVectorizableTinyTree() const;
1405
1406 /// Reorder commutative or alt operands to get better probability of
1407 /// generating vectorized code.
1408 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1409 SmallVectorImpl<Value *> &Left,
1410 SmallVectorImpl<Value *> &Right,
1411 const DataLayout &DL,
1412 ScalarEvolution &SE,
1413 const BoUpSLP &R);
1414 struct TreeEntry {
1415 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
1416 TreeEntry(VecTreeTy &Container) : Container(Container) {}
1417
1418 /// \returns true if the scalars in VL are equal to this entry.
1419 bool isSame(ArrayRef<Value *> VL) const {
1420 if (VL.size() == Scalars.size())
1421 return std::equal(VL.begin(), VL.end(), Scalars.begin());
1422 return VL.size() == ReuseShuffleIndices.size() &&
1423 std::equal(
1424 VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
1425 [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
1426 }
1427
1428 /// A vector of scalars.
1429 ValueList Scalars;
1430
1431 /// The Scalars are vectorized into this value. It is initialized to Null.
1432 Value *VectorizedValue = nullptr;
1433
1434 /// Do we need to gather this sequence?
1435 enum EntryState { Vectorize, NeedToGather };
1436 EntryState State;
1437
1438 /// Does this sequence require some shuffling?
1439 SmallVector<unsigned, 4> ReuseShuffleIndices;
1440
1441 /// Does this entry require reordering?
1442 ArrayRef<unsigned> ReorderIndices;
1443
1444 /// Points back to the VectorizableTree.
1445 ///
1446 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
1447 /// to be a pointer and needs to be able to initialize the child iterator.
1448 /// Thus we need a reference back to the container to translate the indices
1449 /// to entries.
1450 VecTreeTy &Container;
1451
1452 /// The TreeEntry index containing the user of this entry. We can actually
1453 /// have multiple users so the data structure is not truly a tree.
1454 SmallVector<EdgeInfo, 1> UserTreeIndices;
1455
1456 /// The index of this treeEntry in VectorizableTree.
1457 int Idx = -1;
1458
1459 private:
1460 /// The operands of each instruction in each lane Operands[op_index][lane].
1461 /// Note: This helps avoid the replication of the code that performs the
1462 /// reordering of operands during buildTree_rec() and vectorizeTree().
1463 SmallVector<ValueList, 2> Operands;
1464
1465 /// The main/alternate instruction.
1466 Instruction *MainOp = nullptr;
1467 Instruction *AltOp = nullptr;
1468
1469 public:
1470 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
1471 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
1472 if (Operands.size() < OpIdx + 1)
1473 Operands.resize(OpIdx + 1);
1474 assert(Operands[OpIdx].size() == 0 && "Already resized?");
1475 Operands[OpIdx].resize(Scalars.size());
1476 for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
1477 Operands[OpIdx][Lane] = OpVL[Lane];
1478 }
1479
1480 /// Set the operands of this bundle in their original order.
1481 void setOperandsInOrder() {
1482 assert(Operands.empty() && "Already initialized?");
1483 auto *I0 = cast<Instruction>(Scalars[0]);
1484 Operands.resize(I0->getNumOperands());
1485 unsigned NumLanes = Scalars.size();
1486 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
1487 OpIdx != NumOperands; ++OpIdx) {
1488 Operands[OpIdx].resize(NumLanes);
1489 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1490 auto *I = cast<Instruction>(Scalars[Lane]);
1491 assert(I->getNumOperands() == NumOperands &&
1492 "Expected same number of operands");
1493 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
1494 }
1495 }
1496 }
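
In effect, setOperandsInOrder() transposes the per-scalar operand lists into per-operand lane vectors. A rough standalone sketch of that transpose (plain C++, hypothetical types):

#include <vector>

using Operand = int; // stand-in for Value *

// ScalarOperands[Lane][OpIdx] -> Result[OpIdx][Lane], assuming every scalar
// has the same number of operands, as the assert above requires.
std::vector<std::vector<Operand>>
transposeOperands(const std::vector<std::vector<Operand>> &ScalarOperands) {
  size_t NumLanes = ScalarOperands.size();
  size_t NumOperands = NumLanes ? ScalarOperands[0].size() : 0;
  std::vector<std::vector<Operand>> Result(
      NumOperands, std::vector<Operand>(NumLanes));
  for (size_t OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
    for (size_t Lane = 0; Lane != NumLanes; ++Lane)
      Result[OpIdx][Lane] = ScalarOperands[Lane][OpIdx];
  return Result;
}
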
1497
1498 /// \returns the \p OpIdx operand of this TreeEntry.
1499 ValueList &getOperand(unsigned OpIdx) {
1500 assert(OpIdx < Operands.size() && "Off bounds");
1501 return Operands[OpIdx];
1502 }
1503
1504 /// \returns the number of operands.
1505 unsigned getNumOperands() const { return Operands.size(); }
1506
1507 /// \return the single \p OpIdx operand.
1508 Value *getSingleOperand(unsigned OpIdx) const {
1509 assert(OpIdx < Operands.size() && "Off bounds");
1510 assert(!Operands[OpIdx].empty() && "No operand available");
1511 return Operands[OpIdx][0];
1512 }
1513
1514 /// Some of the instructions in the list have alternate opcodes.
1515 bool isAltShuffle() const {
1516 return getOpcode() != getAltOpcode();
1517 }
1518
1519 bool isOpcodeOrAlt(Instruction *I) const {
1520 unsigned CheckedOpcode = I->getOpcode();
1521 return (getOpcode() == CheckedOpcode ||
1522 getAltOpcode() == CheckedOpcode);
1523 }
1524
1525 /// Chooses the correct key for scheduling data. If \p Op has the same (or
1526 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
1527 /// \p OpValue.
1528 Value *isOneOf(Value *Op) const {
1529 auto *I = dyn_cast<Instruction>(Op);
1530 if (I && isOpcodeOrAlt(I))
1531 return Op;
1532 return MainOp;
1533 }
1534
1535 void setOperations(const InstructionsState &S) {
1536 MainOp = S.MainOp;
1537 AltOp = S.AltOp;
1538 }
1539
1540 Instruction *getMainOp() const {
1541 return MainOp;
1542 }
1543
1544 Instruction *getAltOp() const {
1545 return AltOp;
1546 }
1547
1548 /// The main/alternate opcodes for the list of instructions.
1549 unsigned getOpcode() const {
1550 return MainOp ? MainOp->getOpcode() : 0;
1551 }
1552
1553 unsigned getAltOpcode() const {
1554 return AltOp ? AltOp->getOpcode() : 0;
1555 }
1556
1557 /// Update operations state of this entry if reorder occurred.
1558 bool updateStateIfReorder() {
1559 if (ReorderIndices.empty())
1560 return false;
1561 InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
1562 setOperations(S);
1563 return true;
1564 }
1565
1566#ifndef NDEBUG
1567 /// Debug printer.
1568 LLVM_DUMP_METHOD void dump() const {
1569 dbgs() << Idx << ".\n";
1570 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
1571 dbgs() << "Operand " << OpI << ":\n";
1572 for (const Value *V : Operands[OpI])
1573 dbgs().indent(2) << *V << "\n";
1574 }
1575 dbgs() << "Scalars: \n";
1576 for (Value *V : Scalars)
1577 dbgs().indent(2) << *V << "\n";
1578 dbgs() << "State: ";
1579 switch (State) {
1580 case Vectorize:
1581 dbgs() << "Vectorize\n";
1582 break;
1583 case NeedToGather:
1584 dbgs() << "NeedToGather\n";
1585 break;
1586 }
1587 dbgs() << "MainOp: ";
1588 if (MainOp)
1589 dbgs() << *MainOp << "\n";
1590 else
1591 dbgs() << "NULL\n";
1592 dbgs() << "AltOp: ";
1593 if (AltOp)
1594 dbgs() << *AltOp << "\n";
1595 else
1596 dbgs() << "NULL\n";
1597 dbgs() << "VectorizedValue: ";
1598 if (VectorizedValue)
1599 dbgs() << *VectorizedValue << "\n";
1600 else
1601 dbgs() << "NULL\n";
1602 dbgs() << "ReuseShuffleIndices: ";
1603 if (ReuseShuffleIndices.empty())
1604 dbgs() << "Emtpy";
1605 else
1606 for (unsigned ReuseIdx : ReuseShuffleIndices)
1607 dbgs() << ReuseIdx << ", ";
1608 dbgs() << "\n";
1609 dbgs() << "ReorderIndices: ";
1610 for (unsigned ReorderIdx : ReorderIndices)
1611 dbgs() << ReorderIdx << ", ";
1612 dbgs() << "\n";
1613 dbgs() << "UserTreeIndices: ";
1614 for (const auto &EInfo : UserTreeIndices)
1615 dbgs() << EInfo << ", ";
1616 dbgs() << "\n";
1617 }
1618#endif
1619 };
1620
1621 /// Create a new VectorizableTree entry.
1622 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
1623 const InstructionsState &S,
1624 const EdgeInfo &UserTreeIdx,
1625 ArrayRef<unsigned> ReuseShuffleIndices = None,
1626 ArrayRef<unsigned> ReorderIndices = None) {
1627 bool Vectorized = (bool)Bundle;
1628 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
1629 TreeEntry *Last = VectorizableTree.back().get();
1630 Last->Idx = VectorizableTree.size() - 1;
1631 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
1632 Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
1633 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
1634 ReuseShuffleIndices.end());
1635 Last->ReorderIndices = ReorderIndices;
1636 Last->setOperations(S);
1637 if (Vectorized) {
1638 for (int i = 0, e = VL.size(); i != e; ++i) {
1639 assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
1640 ScalarToTreeEntry[VL[i]] = Last;
1641 }
1642 // Update the scheduler bundle to point to this TreeEntry.
1643 unsigned Lane = 0;
1644 for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
1645 BundleMember = BundleMember->NextInBundle) {
1646 BundleMember->TE = Last;
1647 BundleMember->Lane = Lane;
1648 ++Lane;
1649 }
1650 assert((!Bundle.getValue() || Lane == VL.size()) &&
1651 "Bundle and VL out of sync");
1652 } else {
1653 MustGather.insert(VL.begin(), VL.end());
1654 }
1655
1656 if (UserTreeIdx.UserTE)
1657 Last->UserTreeIndices.push_back(UserTreeIdx);
1658
1659 return Last;
1660 }
1661
1662 /// -- Vectorization State --
1663 /// Holds all of the tree entries.
1664 TreeEntry::VecTreeTy VectorizableTree;
1665
1666#ifndef NDEBUG
1667 /// Debug printer.
1668 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
1669 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
1670 VectorizableTree[Id]->dump();
1671 dbgs() << "\n";
1672 }
1673 }
1674#endif
1675
1676 TreeEntry *getTreeEntry(Value *V) {
1677 auto I = ScalarToTreeEntry.find(V);
1678 if (I != ScalarToTreeEntry.end())
1679 return I->second;
1680 return nullptr;
1681 }
1682
1683 const TreeEntry *getTreeEntry(Value *V) const {
1684 auto I = ScalarToTreeEntry.find(V);
1685 if (I != ScalarToTreeEntry.end())
1686 return I->second;
1687 return nullptr;
1688 }
1689
1690 /// Maps a specific scalar to its tree entry.
1691 SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
1692
1693 /// A list of scalars that we found that we need to keep as scalars.
1694 ValueSet MustGather;
1695
1696 /// This POD struct describes one external user in the vectorized tree.
1697 struct ExternalUser {
1698 ExternalUser(Value *S, llvm::User *U, int L)
1699 : Scalar(S), User(U), Lane(L) {}
1700
1701 // Which scalar in our function.
1702 Value *Scalar;
1703
1704 // The user that uses the scalar.
1705 llvm::User *User;
1706
1707 // Which lane does the scalar belong to.
1708 int Lane;
1709 };
1710 using UserList = SmallVector<ExternalUser, 16>;
1711
1712 /// Checks if two instructions may access the same memory.
1713 ///
1714 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
1715 /// is invariant in the calling loop.
1716 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
1717 Instruction *Inst2) {
1718 // First check if the result is already in the cache.
1719 AliasCacheKey key = std::make_pair(Inst1, Inst2);
1720 Optional<bool> &result = AliasCache[key];
1721 if (result.hasValue()) {
1722 return result.getValue();
1723 }
1724 MemoryLocation Loc2 = getLocation(Inst2, AA);
1725 bool aliased = true;
1726 if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
1727 // Do the alias check.
1728 aliased = AA->alias(Loc1, Loc2);
1729 }
1730 // Store the result in the cache.
1731 result = aliased;
1732 return aliased;
1733 }
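
A simplified sketch of the same memoization pattern with standard containers (hypothetical names, not the LLVM API): the expensive pairwise query is computed once per ordered pair of instructions and then answered from the cache.

#include <map>
#include <optional>
#include <utility>

struct Inst;                                     // opaque stand-in for Instruction
bool expensiveAliasQuery(Inst *A, Inst *B) {     // stand-in for AA->alias(...)
  return A == B;                                 // placeholder result
}

using Key = std::pair<Inst *, Inst *>;
std::map<Key, std::optional<bool>> Cache;

bool isAliasedCached(Inst *A, Inst *B) {
  std::optional<bool> &Result = Cache[{A, B}];   // inserts an empty slot on miss
  if (Result.has_value())
    return *Result;                              // cache hit
  bool Aliased = expensiveAliasQuery(A, B);
  Result = Aliased;                              // remember for next time
  return Aliased;
}
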
1734
1735 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
1736
1737 /// Cache for alias results.
1738 /// TODO: consider moving this to the AliasAnalysis itself.
1739 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
1740
1741 /// Removes an instruction from its block and eventually deletes it.
1742 /// It's like Instruction::eraseFromParent() except that the actual deletion
1743 /// is delayed until BoUpSLP is destructed.
1744 /// This is required to ensure that there are no incorrect collisions in the
1745 /// AliasCache, which can happen if a new instruction is allocated at the
1746 /// same address as a previously deleted instruction.
1747 void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
1748 auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
1749 It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
1750 }
1751
1752 /// Temporary store for deleted instructions. Instructions will be deleted
1753 /// eventually when the BoUpSLP is destructed.
1754 DenseMap<Instruction *, bool> DeletedInstructions;
1755
1756 /// A list of values that need to extracted out of the tree.
1757 /// This list holds pairs of (Internal Scalar : External User). External User
1758 /// can be nullptr, it means that this Internal Scalar will be used later,
1759 /// after vectorization.
1760 UserList ExternalUses;
1761
1762 /// Values used only by @llvm.assume calls.
1763 SmallPtrSet<const Value *, 32> EphValues;
1764
1765 /// Holds all of the instructions that we gathered.
1766 SetVector<Instruction *> GatherSeq;
1767
1768 /// A list of blocks that we are going to CSE.
1769 SetVector<BasicBlock *> CSEBlocks;
1770
1771 /// Contains all scheduling relevant data for an instruction.
1772 /// A ScheduleData either represents a single instruction or a member of an
1773 /// instruction bundle (= a group of instructions which is combined into a
1774 /// vector instruction).
1775 struct ScheduleData {
1776 // The initial value for the dependency counters. It means that the
1777 // dependencies are not calculated yet.
1778 enum { InvalidDeps = -1 };
1779
1780 ScheduleData() = default;
1781
1782 void init(int BlockSchedulingRegionID, Value *OpVal) {
1783 FirstInBundle = this;
1784 NextInBundle = nullptr;
1785 NextLoadStore = nullptr;
1786 IsScheduled = false;
1787 SchedulingRegionID = BlockSchedulingRegionID;
1788 UnscheduledDepsInBundle = UnscheduledDeps;
1789 clearDependencies();
1790 OpValue = OpVal;
1791 TE = nullptr;
1792 Lane = -1;
1793 }
1794
1795 /// Returns true if the dependency information has been calculated.
1796 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
1797
1798 /// Returns true for single instructions and for bundle representatives
1799 /// (= the head of a bundle).
1800 bool isSchedulingEntity() const { return FirstInBundle == this; }
1801
1802 /// Returns true if it represents an instruction bundle and not only a
1803 /// single instruction.
1804 bool isPartOfBundle() const {
1805 return NextInBundle != nullptr || FirstInBundle != this;
1806 }
1807
1808 /// Returns true if it is ready for scheduling, i.e. it has no more
1809 /// unscheduled depending instructions/bundles.
1810 bool isReady() const {
1811 assert(isSchedulingEntity() &&
1812 "can't consider non-scheduling entity for ready list");
1813 return UnscheduledDepsInBundle == 0 && !IsScheduled;
1814 }
1815
1816 /// Modifies the number of unscheduled dependencies, also updating it for
1817 /// the whole bundle.
1818 int incrementUnscheduledDeps(int Incr) {
1819 UnscheduledDeps += Incr;
1820 return FirstInBundle->UnscheduledDepsInBundle += Incr;
1821 }
1822
1823 /// Sets the number of unscheduled dependencies to the number of
1824 /// dependencies.
1825 void resetUnscheduledDeps() {
1826 incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
1827 }
1828
1829 /// Clears all dependency information.
1830 void clearDependencies() {
1831 Dependencies = InvalidDeps;
1832 resetUnscheduledDeps();
1833 MemoryDependencies.clear();
1834 }
1835
1836 void dump(raw_ostream &os) const {
1837 if (!isSchedulingEntity()) {
1838 os << "/ " << *Inst;
1839 } else if (NextInBundle) {
1840 os << '[' << *Inst;
1841 ScheduleData *SD = NextInBundle;
1842 while (SD) {
1843 os << ';' << *SD->Inst;
1844 SD = SD->NextInBundle;
1845 }
1846 os << ']';
1847 } else {
1848 os << *Inst;
1849 }
1850 }
1851
1852 Instruction *Inst = nullptr;
1853
1854 /// Points to the head in an instruction bundle (and always to this for
1855 /// single instructions).
1856 ScheduleData *FirstInBundle = nullptr;
1857
1858 /// Single linked list of all instructions in a bundle. Null if it is a
1859 /// single instruction.
1860 ScheduleData *NextInBundle = nullptr;
1861
1862 /// Single linked list of all memory instructions (e.g. load, store, call)
1863 /// in the block - until the end of the scheduling region.
1864 ScheduleData *NextLoadStore = nullptr;
1865
1866 /// The dependent memory instructions.
1867 /// This list is derived on demand in calculateDependencies().
1868 SmallVector<ScheduleData *, 4> MemoryDependencies;
1869
1870 /// This ScheduleData is in the current scheduling region if this matches
1871 /// the current SchedulingRegionID of BlockScheduling.
1872 int SchedulingRegionID = 0;
1873
1874 /// Used for getting a "good" final ordering of instructions.
1875 int SchedulingPriority = 0;
1876
1877 /// The number of dependencies. Consists of the number of users of the
1878 /// instruction plus the number of dependent memory instructions (if any).
1879 /// This value is calculated on demand.
1880 /// If InvalidDeps, the number of dependencies is not calculated yet.
1881 int Dependencies = InvalidDeps;
1882
1883 /// The number of dependencies minus the number of dependencies of scheduled
1884 /// instructions. As soon as this is zero, the instruction/bundle gets ready
1885 /// for scheduling.
1886 /// Note that this is negative as long as Dependencies is not calculated.
1887 int UnscheduledDeps = InvalidDeps;
1888
1889 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
1890 /// single instructions.
1891 int UnscheduledDepsInBundle = InvalidDeps;
1892
1893 /// True if this instruction is scheduled (or considered as scheduled in the
1894 /// dry-run).
1895 bool IsScheduled = false;
1896
1897 /// Opcode of the current instruction in the schedule data.
1898 Value *OpValue = nullptr;
1899
1900 /// The TreeEntry that this instruction corresponds to.
1901 TreeEntry *TE = nullptr;
1902
1903 /// The lane of this node in the TreeEntry.
1904 int Lane = -1;
1905 };
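
The counter scheme described above can be reduced to a toy model (plain C++, not the LLVM API): each node tracks how many of its dependences are still unscheduled, and it becomes ready exactly when that counter reaches zero.

#include <vector>

struct Node {
  int UnscheduledDeps = 0;          // analogous to UnscheduledDeps
  std::vector<Node *> Dependents;   // nodes that wait on this one
};

void markScheduled(Node *N, std::vector<Node *> &ReadyList) {
  for (Node *Dep : N->Dependents)
    if (--Dep->UnscheduledDeps == 0)   // last outstanding dependence resolved
      ReadyList.push_back(Dep);        // analogous to ReadyList.insert(DepBundle)
}
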
1906
1907#ifndef NDEBUG
1908 friend inline raw_ostream &operator<<(raw_ostream &os,
1909 const BoUpSLP::ScheduleData &SD) {
1910 SD.dump(os);
1911 return os;
1912 }
1913#endif
1914
1915 friend struct GraphTraits<BoUpSLP *>;
1916 friend struct DOTGraphTraits<BoUpSLP *>;
1917
1918 /// Contains all scheduling data for a basic block.
1919 struct BlockScheduling {
1920 BlockScheduling(BasicBlock *BB)
1921 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
1922
1923 void clear() {
1924 ReadyInsts.clear();
1925 ScheduleStart = nullptr;
1926 ScheduleEnd = nullptr;
1927 FirstLoadStoreInRegion = nullptr;
1928 LastLoadStoreInRegion = nullptr;
1929
1930 // Reduce the maximum schedule region size by the size of the
1931 // previous scheduling run.
1932 ScheduleRegionSizeLimit -= ScheduleRegionSize;
1933 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
1934 ScheduleRegionSizeLimit = MinScheduleRegionSize;
1935 ScheduleRegionSize = 0;
1936
1937 // Make a new scheduling region, i.e. all existing ScheduleData is not
1938 // in the new region yet.
1939 ++SchedulingRegionID;
1940 }
1941
1942 ScheduleData *getScheduleData(Value *V) {
1943 ScheduleData *SD = ScheduleDataMap[V];
1944 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1945 return SD;
1946 return nullptr;
1947 }
1948
1949 ScheduleData *getScheduleData(Value *V, Value *Key) {
1950 if (V == Key)
1951 return getScheduleData(V);
1952 auto I = ExtraScheduleDataMap.find(V);
1953 if (I != ExtraScheduleDataMap.end()) {
1954 ScheduleData *SD = I->second[Key];
1955 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1956 return SD;
1957 }
1958 return nullptr;
1959 }
1960
1961 bool isInSchedulingRegion(ScheduleData *SD) const {
1962 return SD->SchedulingRegionID == SchedulingRegionID;
1963 }
1964
1965 /// Marks an instruction as scheduled and puts all dependent ready
1966 /// instructions into the ready-list.
1967 template <typename ReadyListType>
1968 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
1969 SD->IsScheduled = true;
1970 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
1971
1972 ScheduleData *BundleMember = SD;
1973 while (BundleMember) {
1974 if (BundleMember->Inst != BundleMember->OpValue) {
1975 BundleMember = BundleMember->NextInBundle;
1976 continue;
1977 }
1978 // Handle the def-use chain dependencies.
1979
1980 // Decrement the unscheduled counter and insert to ready list if ready.
1981 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
1982 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
1983 if (OpDef && OpDef->hasValidDependencies() &&
1984 OpDef->incrementUnscheduledDeps(-1) == 0) {
1985 // There are no more unscheduled dependencies after
1986 // decrementing, so we can put the dependent instruction
1987 // into the ready list.
1988 ScheduleData *DepBundle = OpDef->FirstInBundle;
1989 assert(!DepBundle->IsScheduled &&
1990 "already scheduled bundle gets ready");
1991 ReadyList.insert(DepBundle);
1992 LLVM_DEBUG(dbgs()
1993 << "SLP: gets ready (def): " << *DepBundle << "\n");
1994 }
1995 });
1996 };
1997
1998 // If BundleMember is a vector bundle, its operands may have been
1999 // reordered during buildTree(). We therefore need to get its operands
2000 // through the TreeEntry.
2001 if (TreeEntry *TE = BundleMember->TE) {
2002 int Lane = BundleMember->Lane;
2003 assert(Lane >= 0 && "Lane not set");
2004 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
2005 OpIdx != NumOperands; ++OpIdx)
2006 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
2007 DecrUnsched(I);
2008 } else {
2009 // If BundleMember is a stand-alone instruction, no operand reordering
2010 // has taken place, so we directly access its operands.
2011 for (Use &U : BundleMember->Inst->operands())
2012 if (auto *I = dyn_cast<Instruction>(U.get()))
2013 DecrUnsched(I);
2014 }
2015 // Handle the memory dependencies.
2016 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
2017 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
2018 // There are no more unscheduled dependencies after decrementing,
2019 // so we can put the dependent instruction into the ready list.
2020 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
2021 assert(!DepBundle->IsScheduled &&
2022 "already scheduled bundle gets ready");
2023 ReadyList.insert(DepBundle);
2024 LLVM_DEBUG(dbgs()
2025 << "SLP: gets ready (mem): " << *DepBundle << "\n");
2026 }
2027 }
2028 BundleMember = BundleMember->NextInBundle;
2029 }
2030 }
2031
2032 void doForAllOpcodes(Value *V,
2033 function_ref<void(ScheduleData *SD)> Action) {
2034 if (ScheduleData *SD = getScheduleData(V))
2035 Action(SD);
2036 auto I = ExtraScheduleDataMap.find(V);
2037 if (I != ExtraScheduleDataMap.end())
2038 for (auto &P : I->second)
2039 if (P.second->SchedulingRegionID == SchedulingRegionID)
2040 Action(P.second);
2041 }
2042
2043 /// Put all instructions into the ReadyList which are ready for scheduling.
2044 template <typename ReadyListType>
2045 void initialFillReadyList(ReadyListType &ReadyList) {
2046 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2047 doForAllOpcodes(I, [&](ScheduleData *SD) {
2048 if (SD->isSchedulingEntity() && SD->isReady()) {
2049 ReadyList.insert(SD);
2050 LLVM_DEBUG(dbgs()
2051 << "SLP: initially in ready list: " << *I << "\n");
2052 }
2053 });
2054 }
2055 }
2056
2057 /// Checks if a bundle of instructions can be scheduled, i.e. has no
2058 /// cyclic dependencies. This is only a dry-run, no instructions are
2059 /// actually moved at this stage.
2060 /// \returns the scheduling bundle. The returned Optional value is non-None
2061 /// if \p VL is allowed to be scheduled.
2062 Optional<ScheduleData *>
2063 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
2064 const InstructionsState &S);
2065
2066 /// Un-bundles a group of instructions.
2067 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
2068
2069 /// Allocates schedule data chunk.
2070 ScheduleData *allocateScheduleDataChunks();
2071
2072 /// Extends the scheduling region so that V is inside the region.
2073 /// \returns true if the region size is within the limit.
2074 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
2075
2076 /// Initialize the ScheduleData structures for new instructions in the
2077 /// scheduling region.
2078 void initScheduleData(Instruction *FromI, Instruction *ToI,
2079 ScheduleData *PrevLoadStore,
2080 ScheduleData *NextLoadStore);
2081
2082 /// Updates the dependency information of a bundle and of all instructions/
2083 /// bundles which depend on the original bundle.
2084 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
2085 BoUpSLP *SLP);
2086
2087 /// Sets all instructions in the scheduling region to un-scheduled.
2088 void resetSchedule();
2089
2090 BasicBlock *BB;
2091
2092 /// Simple memory allocation for ScheduleData.
2093 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
2094
2095 /// The size of a ScheduleData array in ScheduleDataChunks.
2096 int ChunkSize;
2097
2098 /// The allocator position in the current chunk, which is the last entry
2099 /// of ScheduleDataChunks.
2100 int ChunkPos;
2101
2102 /// Attaches ScheduleData to Instruction.
2103 /// Note that the mapping survives during all vectorization iterations, i.e.
2104 /// ScheduleData structures are recycled.
2105 DenseMap<Value *, ScheduleData *> ScheduleDataMap;
2106
2107 /// Attaches ScheduleData to Instruction with the leading key.
2108 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
2109 ExtraScheduleDataMap;
2110
2111 struct ReadyList : SmallVector<ScheduleData *, 8> {
2112 void insert(ScheduleData *SD) { push_back(SD); }
2113 };
2114
2115 /// The ready-list for scheduling (only used for the dry-run).
2116 ReadyList ReadyInsts;
2117
2118 /// The first instruction of the scheduling region.
2119 Instruction *ScheduleStart = nullptr;
2120
2121 /// The first instruction _after_ the scheduling region.
2122 Instruction *ScheduleEnd = nullptr;
2123
2124 /// The first memory accessing instruction in the scheduling region
2125 /// (can be null).
2126 ScheduleData *FirstLoadStoreInRegion = nullptr;
2127
2128 /// The last memory accessing instruction in the scheduling region
2129 /// (can be null).
2130 ScheduleData *LastLoadStoreInRegion = nullptr;
2131
2132 /// The current size of the scheduling region.
2133 int ScheduleRegionSize = 0;
2134
2135 /// The maximum size allowed for the scheduling region.
2136 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
2137
2138 /// The ID of the scheduling region. For a new vectorization iteration this
2139 /// is incremented which "removes" all ScheduleData from the region.
2140 // Make sure that the initial SchedulingRegionID is greater than the
2141 // initial SchedulingRegionID in ScheduleData (which is 0).
2142 int SchedulingRegionID = 1;
2143 };
2144
2145 /// Attaches the BlockScheduling structures to basic blocks.
2146 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
2147
2148 /// Performs the "real" scheduling. Done before vectorization is actually
2149 /// performed in a basic block.
2150 void scheduleBlock(BlockScheduling *BS);
2151
2152 /// List of users to ignore during scheduling and that don't need extracting.
2153 ArrayRef<Value *> UserIgnoreList;
2154
2155 using OrdersType = SmallVector<unsigned, 4>;
2156 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
2157 /// sorted SmallVectors of unsigned.
2158 struct OrdersTypeDenseMapInfo {
2159 static OrdersType getEmptyKey() {
2160 OrdersType V;
2161 V.push_back(~1U);
2162 return V;
2163 }
2164
2165 static OrdersType getTombstoneKey() {
2166 OrdersType V;
2167 V.push_back(~2U);
2168 return V;
2169 }
2170
2171 static unsigned getHashValue(const OrdersType &V) {
2172 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
2173 }
2174
2175 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
2176 return LHS == RHS;
2177 }
2178 };
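
For comparison, a rough standard-library analogue of what these traits provide (hypothetical code, not the LLVM API): a combined hash over the whole order vector so that an operand order can key a hash map. DenseMap additionally requires the reserved empty and tombstone keys defined above.

#include <cstddef>
#include <functional>
#include <unordered_map>
#include <vector>

struct OrderHash {
  std::size_t operator()(const std::vector<unsigned> &V) const {
    std::size_t H = 0;
    for (unsigned X : V)                       // simple hash_combine-style mix
      H = H * 1000003u + std::hash<unsigned>()(X);
    return H;
  }
};

using OrderCounts =
    std::unordered_map<std::vector<unsigned>, unsigned, OrderHash>;
// OrderCounts Counts; ++Counts[{1, 0, 3, 2}]; // count bundles per order
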
2179
2180 /// Contains orders of operations along with the number of bundles that have
2181 /// operations in this order. It stores only those orders that require
2182 /// reordering, if reordering is not required it is counted using \a
2183 /// NumOpsWantToKeepOriginalOrder.
2184 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
2185 /// Number of bundles that do not require reordering.
2186 unsigned NumOpsWantToKeepOriginalOrder = 0;
2187
2188 // Analysis and block reference.
2189 Function *F;
2190 ScalarEvolution *SE;
2191 TargetTransformInfo *TTI;
2192 TargetLibraryInfo *TLI;
2193 AliasAnalysis *AA;
2194 LoopInfo *LI;
2195 DominatorTree *DT;
2196 AssumptionCache *AC;
2197 DemandedBits *DB;
2198 const DataLayout *DL;
2199 OptimizationRemarkEmitter *ORE;
2200
2201 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
2202 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
2203
2204 /// Instruction builder to construct the vectorized tree.
2205 IRBuilder<> Builder;
2206
2207 /// A map of scalar integer values to the smallest bit width with which they
2208 /// can legally be represented. The values map to (width, signed) pairs,
2209 /// where "width" indicates the minimum bit width and "signed" is True if the
2210 /// value must be signed-extended, rather than zero-extended, back to its
2211 /// original width.
2212 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
2213};
2214
2215} // end namespace slpvectorizer
2216
2217template <> struct GraphTraits<BoUpSLP *> {
2218 using TreeEntry = BoUpSLP::TreeEntry;
2219
2220 /// NodeRef has to be a pointer per the GraphWriter.
2221 using NodeRef = TreeEntry *;
2222
2223 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
2224
2225 /// Add the VectorizableTree to the index iterator to be able to return
2226 /// TreeEntry pointers.
2227 struct ChildIteratorType
2228 : public iterator_adaptor_base<
2229 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
2230 ContainerTy &VectorizableTree;
2231
2232 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
2233 ContainerTy &VT)
2234 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
2235
2236 NodeRef operator*() { return I->UserTE; }
2237 };
2238
2239 static NodeRef getEntryNode(BoUpSLP &R) {
2240 return R.VectorizableTree[0].get();
2241 }
2242
2243 static ChildIteratorType child_begin(NodeRef N) {
2244 return {N->UserTreeIndices.begin(), N->Container};
2245 }
2246
2247 static ChildIteratorType child_end(NodeRef N) {
2248 return {N->UserTreeIndices.end(), N->Container};
2249 }
2250
2251 /// For the node iterator we just need to turn the TreeEntry iterator into a
2252 /// TreeEntry* iterator so that it dereferences to NodeRef.
2253 class nodes_iterator {
2254 using ItTy = ContainerTy::iterator;
2255 ItTy It;
2256
2257 public:
2258 nodes_iterator(const ItTy &It2) : It(It2) {}
2259 NodeRef operator*() { return It->get(); }
2260 nodes_iterator operator++() {
2261 ++It;
2262 return *this;
2263 }
2264 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
2265 };
2266
2267 static nodes_iterator nodes_begin(BoUpSLP *R) {
2268 return nodes_iterator(R->VectorizableTree.begin());
2269 }
2270
2271 static nodes_iterator nodes_end(BoUpSLP *R) {
2272 return nodes_iterator(R->VectorizableTree.end());
2273 }
2274
2275 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
2276};
2277
2278template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
2279 using TreeEntry = BoUpSLP::TreeEntry;
2280
2281 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
2282
2283 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
2284 std::string Str;
2285 raw_string_ostream OS(Str);
2286 if (isSplat(Entry->Scalars)) {
2287 OS << "<splat> " << *Entry->Scalars[0];
2288 return Str;
2289 }
2290 for (auto V : Entry->Scalars) {
2291 OS << *V;
2292 if (std::any_of(
2293 R->ExternalUses.begin(), R->ExternalUses.end(),
2294 [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
2295 OS << " <extract>";
2296 OS << "\n";
2297 }
2298 return Str;
2299 }
2300
2301 static std::string getNodeAttributes(const TreeEntry *Entry,
2302 const BoUpSLP *) {
2303 if (Entry->State == TreeEntry::NeedToGather)
2304 return "color=red";
2305 return "";
2306 }
2307};
2308
2309} // end namespace llvm
2310
2311BoUpSLP::~BoUpSLP() {
2312 for (const auto &Pair : DeletedInstructions) {
2313 // Replace operands of ignored instructions with Undefs in case they were
2314 // marked for deletion.
2315 if (Pair.getSecond()) {
2316 Value *Undef = UndefValue::get(Pair.getFirst()->getType());
2317 Pair.getFirst()->replaceAllUsesWith(Undef);
2318 }
2319 Pair.getFirst()->dropAllReferences();
2320 }
2321 for (const auto &Pair : DeletedInstructions) {
2322 assert(Pair.getFirst()->use_empty() &&
2323 "trying to erase instruction with users.");
2324 Pair.getFirst()->eraseFromParent();
2325 }
2326}
2327
2328void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
2329 for (auto *V : AV) {
2330 if (auto *I = dyn_cast<Instruction>(V))
2331 eraseInstruction(I, /*ReplaceWithUndef=*/true);
2332 };
2333}
2334
2335void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
2336 ArrayRef<Value *> UserIgnoreLst) {
2337 ExtraValueToDebugLocsMap ExternallyUsedValues;
2338 buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
2339}
2340
2341void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
2342 ExtraValueToDebugLocsMap &ExternallyUsedValues,
2343 ArrayRef<Value *> UserIgnoreLst) {
2344 deleteTree();
2345 UserIgnoreList = UserIgnoreLst;
2346 if (!allSameType(Roots))
2347 return;
2348 buildTree_rec(Roots, 0, EdgeInfo());
2349
2350 // Collect the values that we need to extract from the tree.
2351 for (auto &TEPtr : VectorizableTree) {
2352 TreeEntry *Entry = TEPtr.get();
2353
2354 // No need to handle users of gathered values.
2355 if (Entry->State == TreeEntry::NeedToGather)
2356 continue;
2357
2358 // For each lane:
2359 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2360 Value *Scalar = Entry->Scalars[Lane];
2361 int FoundLane = Lane;
2362 if (!Entry->ReuseShuffleIndices.empty()) {
2363 FoundLane =
2364 std::distance(Entry->ReuseShuffleIndices.begin(),
2365 llvm::find(Entry->ReuseShuffleIndices, FoundLane));
2366 }
2367
2368 // Check if the scalar is externally used as an extra arg.
2369 auto ExtI = ExternallyUsedValues.find(Scalar);
2370 if (ExtI != ExternallyUsedValues.end()) {
2371 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
2372 << Lane << " from " << *Scalar << ".\n");
2373 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
2374 }
2375 for (User *U : Scalar->users()) {
2376 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
2377
2378 Instruction *UserInst = dyn_cast<Instruction>(U);
2379 if (!UserInst)
2380 continue;
2381
2382 // Skip in-tree scalars that become vectors
2383 if (TreeEntry *UseEntry = getTreeEntry(U)) {
2384 Value *UseScalar = UseEntry->Scalars[0];
2385 // Some in-tree scalars will remain as scalar in vectorized
2386 // instructions. If that is the case, the one in Lane 0 will
2387 // be used.
2388 if (UseScalar != U ||
2389 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
2390 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
2391 << ".\n");
2392 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
2393 continue;
2394 }
2395 }
2396
2397 // Ignore users in the user ignore list.
2398 if (is_contained(UserIgnoreList, UserInst))
2399 continue;
2400
2401 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
2402 << Lane << " from " << *Scalar << ".\n");
2403 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
2404 }
2405 }
2406 }
2407}
2408
2409void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2410 const EdgeInfo &UserTreeIdx) {
2411 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
2412
2413 InstructionsState S = getSameOpcode(VL);
2414 if (Depth == RecursionMaxDepth) {
2415 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
2416 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2417 return;
2418 }
2419
2420 // Don't handle vectors.
2421 if (S.OpValue->getType()->isVectorTy()) {
2422 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
2423 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2424 return;
2425 }
2426
2427 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
2428 if (SI->getValueOperand()->getType()->isVectorTy()) {
2429 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
2430 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2431 return;
2432 }
2433
2434 // If all of the operands are identical or constant we have a simple solution.
2435 if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
2436 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
2437 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2438 return;
2439 }
2440
2441 // We now know that this is a vector of instructions of the same type from
2442 // the same block.
2443
2444 // Don't vectorize ephemeral values.
2445 for (Value *V : VL) {
2446 if (EphValues.count(V)) {
2447 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
2448 << ") is ephemeral.\n");
2449 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2450 return;
2451 }
2452 }
2453
2454 // Check if this is a duplicate of another entry.
2455 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
2456 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
2457 if (!E->isSame(VL)) {
2458 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
2459 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2460 return;
2461 }
2462 // Record the reuse of the tree node. FIXME, currently this is only used to
2463 // properly draw the graph rather than for the actual vectorization.
2464 E->UserTreeIndices.push_back(UserTreeIdx);
2465 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
2466 << ".\n");
2467 return;
2468 }
2469
2470 // Check that none of the instructions in the bundle are already in the tree.
2471 for (Value *V : VL) {
2472 auto *I = dyn_cast<Instruction>(V);
2473 if (!I)
2474 continue;
2475 if (getTreeEntry(I)) {
2476 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
2477 << ") is already in tree.\n");
2478 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2479 return;
2480 }
2481 }
2482
2483 // If any of the scalars is marked as a value that needs to stay scalar, then
2484 // we need to gather the scalars.
2485 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
2486 for (Value *V : VL) {
2487 if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
2488 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
2489 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2490 return;
2491 }
2492 }
2493
2494 // Check that all of the users of the scalars that we want to vectorize are
2495 // schedulable.
2496 auto *VL0 = cast<Instruction>(S.OpValue);
2497 BasicBlock *BB = VL0->getParent();
2498
2499 if (!DT->isReachableFromEntry(BB)) {
2500 // Don't go into unreachable blocks. They may contain instructions with
2501 // dependency cycles which confuse the final scheduling.
2502 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
2503 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2504 return;
2505 }
2506
2507 // Check that every instruction appears once in this bundle.
2508 SmallVector<unsigned, 4> ReuseShuffleIndicies;
2509 SmallVector<Value *, 4> UniqueValues;
2510 DenseMap<Value *, unsigned> UniquePositions;
2511 for (Value *V : VL) {
2512 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
2513 ReuseShuffleIndicies.emplace_back(Res.first->second);
2514 if (Res.second)
2515 UniqueValues.emplace_back(V);
2516 }
2517 size_t NumUniqueScalarValues = UniqueValues.size();
2518 if (NumUniqueScalarValues == VL.size()) {
2519 ReuseShuffleIndicies.clear();
2520 } else {
2521 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
2522 if (NumUniqueScalarValues <= 1 ||
2523 !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
2524 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
2525 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2526 return;
2527 }
2528 VL = UniqueValues;
2529 }
2530
2531 auto &BSRef = BlocksSchedules[BB];
2532 if (!BSRef)
2533 BSRef = std::make_unique<BlockScheduling>(BB);
2534
2535 BlockScheduling &BS = *BSRef.get();
2536
2537 Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
2538 if (!Bundle) {
2539 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
2540 assert((!BS.getScheduleData(VL0) ||
2541 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
2542 "tryScheduleBundle should cancelScheduling on failure");
2543 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2544 ReuseShuffleIndicies);
2545 return;
2546 }
2547 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
2548
2549 unsigned ShuffleOrOp = S.isAltShuffle() ?
2550 (unsigned) Instruction::ShuffleVector : S.getOpcode();
2551 switch (ShuffleOrOp) {
2552 case Instruction::PHI: {
2553 auto *PH = cast<PHINode>(VL0);
2554
2555 // Check for terminator values (e.g. invoke).
2556 for (unsigned j = 0; j < VL.size(); ++j)
2557 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2558 Instruction *Term = dyn_cast<Instruction>(
2559 cast<PHINode>(VL[j])->getIncomingValueForBlock(
2560 PH->getIncomingBlock(i)));
2561 if (Term && Term->isTerminator()) {
2562 LLVM_DEBUG(dbgs()
2563 << "SLP: Need to swizzle PHINodes (terminator use).\n");
2564 BS.cancelScheduling(VL, VL0);
2565 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2566 ReuseShuffleIndicies);
2567 return;
2568 }
2569 }
2570
2571 TreeEntry *TE =
2572 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
2573 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
2574
2575 // Keeps the reordered operands to avoid code duplication.
2576 SmallVector<ValueList, 2> OperandsVec;
2577 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2578 ValueList Operands;
2579 // Prepare the operand vector.
2580 for (Value *j : VL)
2581 Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
2582 PH->getIncomingBlock(i)));
2583 TE->setOperand(i, Operands);
2584 OperandsVec.push_back(Operands);
2585 }
2586 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
2587 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
2588 return;
2589 }
2590 case Instruction::ExtractValue:
2591 case Instruction::ExtractElement: {
2592 OrdersType CurrentOrder;
2593 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
2594 if (Reuse) {
2595 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
2596 ++NumOpsWantToKeepOriginalOrder;
2597 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2598 ReuseShuffleIndicies);
2599 // This is a special case, as it does not gather, but at the same time
2600 // we are not extending buildTree_rec() towards the operands.
2601 ValueList Op0;
2602 Op0.assign(VL.size(), VL0->getOperand(0));
2603 VectorizableTree.back()->setOperand(0, Op0);
2604 return;
2605 }
2606 if (!CurrentOrder.empty()) {
2607 LLVM_DEBUG({
2608 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
2609 "with order";
2610 for (unsigned Idx : CurrentOrder)
2611 dbgs() << " " << Idx;
2612 dbgs() << "\n";
2613 });
2614 // Insert new order with initial value 0, if it does not exist,
2615 // otherwise return the iterator to the existing one.
2616 auto StoredCurrentOrderAndNum =
2617 NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
2618 ++StoredCurrentOrderAndNum->getSecond();
2619 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2620 ReuseShuffleIndicies,
2621 StoredCurrentOrderAndNum->getFirst());
2622 // This is a special case, as it does not gather, but at the same time
2623 // we are not extending buildTree_rec() towards the operands.
2624 ValueList Op0;
2625 Op0.assign(VL.size(), VL0->getOperand(0));
2626 VectorizableTree.back()->setOperand(0, Op0);
2627 return;
2628 }
2629 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
2630 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2631 ReuseShuffleIndicies);
2632 BS.cancelScheduling(VL, VL0);
2633 return;
2634 }
2635 case Instruction::Load: {
2636 // Check that a vectorized load would load the same memory as a scalar
2637 // load. For example, we don't want to vectorize loads that are smaller
2638 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
2639 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
2640 // from such a struct, we read/write packed bits disagreeing with the
2641 // unvectorized version.
2642 Type *ScalarTy = VL0->getType();
2643
2644 if (DL->getTypeSizeInBits(ScalarTy) !=
2645 DL->getTypeAllocSizeInBits(ScalarTy)) {
2646 BS.cancelScheduling(VL, VL0);
2647 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2648 ReuseShuffleIndicies);
2649 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
2650 return;
2651 }
2652
2653 // Make sure all loads in the bundle are simple - we can't vectorize
2654 // atomic or volatile loads.
2655 SmallVector<Value *, 4> PointerOps(VL.size());
2656 auto POIter = PointerOps.begin();
2657 for (Value *V : VL) {
2658 auto *L = cast<LoadInst>(V);
2659 if (!L->isSimple()) {
2660 BS.cancelScheduling(VL, VL0);
2661 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2662 ReuseShuffleIndicies);
2663 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
2664 return;
2665 }
2666 *POIter = L->getPointerOperand();
2667 ++POIter;
2668 }
2669
2670 OrdersType CurrentOrder;
2671 // Check the order of pointer operands.
2672 if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
2673 Value *Ptr0;
2674 Value *PtrN;
2675 if (CurrentOrder.empty()) {
2676 Ptr0 = PointerOps.front();
2677 PtrN = PointerOps.back();
2678 } else {
2679 Ptr0 = PointerOps[CurrentOrder.front()];
2680 PtrN = PointerOps[CurrentOrder.back()];
2681 }
2682 const SCEV *Scev0 = SE->getSCEV(Ptr0);
2683 const SCEV *ScevN = SE->getSCEV(PtrN);
2684 const auto *Diff =
2685 dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
2686 uint64_t Size = DL->getTypeAllocSize(ScalarTy);
2687 // Check that the sorted loads are consecutive.
2688 if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
2689 if (CurrentOrder.empty()) {
2690 // Original loads are consecutive and do not require reordering.
2691 ++NumOpsWantToKeepOriginalOrder;
2692 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
2693 UserTreeIdx, ReuseShuffleIndicies);
2694 TE->setOperandsInOrder();
2695 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
2696 } else {
2697 // Need to reorder.
2698 auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
2699 ++I->getSecond();
2700 TreeEntry *TE =
2701 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2702 ReuseShuffleIndicies, I->getFirst());
2703 TE->setOperandsInOrder();
2704 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
2705 }
2706 return;
2707 }
2708 }
2709
2710 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
2711 BS.cancelScheduling(VL, VL0);
2712 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2713 ReuseShuffleIndicies);
2714 return;
2715 }
2716 case Instruction::ZExt:
2717 case Instruction::SExt:
2718 case Instruction::FPToUI:
2719 case Instruction::FPToSI:
2720 case Instruction::FPExt:
2721 case Instruction::PtrToInt:
2722 case Instruction::IntToPtr:
2723 case Instruction::SIToFP:
2724 case Instruction::UIToFP:
2725 case Instruction::Trunc:
2726 case Instruction::FPTrunc:
2727 case Instruction::BitCast: {
2728 Type *SrcTy = VL0->getOperand(0)->getType();
2729 for (Value *V : VL) {
2730 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
2731 if (Ty != SrcTy || !isValidElementType(Ty)) {
2732 BS.cancelScheduling(VL, VL0);
2733 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2734 ReuseShuffleIndicies);
2735 LLVM_DEBUG(dbgs()
2736 << "SLP: Gathering casts with different src types.\n");
2737 return;
2738 }
2739 }
2740 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2741 ReuseShuffleIndicies);
2742 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
2743
2744 TE->setOperandsInOrder();
2745 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
2746 ValueList Operands;
2747 // Prepare the operand vector.
2748 for (Value *V : VL)
2749 Operands.push_back(cast<Instruction>(V)->getOperand(i));
2750
2751 buildTree_rec(Operands, Depth + 1, {TE, i});
2752 }
2753 return;
2754 }
2755 case Instruction::ICmp:
2756 case Instruction::FCmp: {
2757 // Check that all of the compares have the same predicate.
2758 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2759 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
2760 Type *ComparedTy = VL0->getOperand(0)->getType();
2761 for (Value *V : VL) {
2762 CmpInst *Cmp = cast<CmpInst>(V);
2763 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
2764 Cmp->getOperand(0)->getType() != ComparedTy) {
2765 BS.cancelScheduling(VL, VL0);
2766 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2767 ReuseShuffleIndicies);
2768 LLVM_DEBUG(dbgs()
2769 << "SLP: Gathering cmp with different predicate.\n");
2770 return;
2771 }
2772 }
2773
2774 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2775 ReuseShuffleIndicies);
2776 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
2777
2778 ValueList Left, Right;
2779 if (cast<CmpInst>(VL0)->isCommutative()) {
2780 // Commutative predicate - collect + sort operands of the instructions
2781 // so that each side is more likely to have the same opcode.
2782 assert(P0 == SwapP0 && "Commutative Predicate mismatch")((P0 == SwapP0 && "Commutative Predicate mismatch") ?
static_cast<void> (0) : __assert_fail ("P0 == SwapP0 && \"Commutative Predicate mismatch\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 2782, __PRETTY_FUNCTION__))
;
2783 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
2784 } else {
2785 // Collect operands - commute if it uses the swapped predicate.
2786 for (Value *V : VL) {
2787 auto *Cmp = cast<CmpInst>(V);
2788 Value *LHS = Cmp->getOperand(0);
2789 Value *RHS = Cmp->getOperand(1);
2790 if (Cmp->getPredicate() != P0)
2791 std::swap(LHS, RHS);
2792 Left.push_back(LHS);
2793 Right.push_back(RHS);
2794 }
2795 }
2796 TE->setOperand(0, Left);
2797 TE->setOperand(1, Right);
2798 buildTree_rec(Left, Depth + 1, {TE, 0});
2799 buildTree_rec(Right, Depth + 1, {TE, 1});
2800 return;
2801 }
2802 case Instruction::Select:
2803 case Instruction::FNeg:
2804 case Instruction::Add:
2805 case Instruction::FAdd:
2806 case Instruction::Sub:
2807 case Instruction::FSub:
2808 case Instruction::Mul:
2809 case Instruction::FMul:
2810 case Instruction::UDiv:
2811 case Instruction::SDiv:
2812 case Instruction::FDiv:
2813 case Instruction::URem:
2814 case Instruction::SRem:
2815 case Instruction::FRem:
2816 case Instruction::Shl:
2817 case Instruction::LShr:
2818 case Instruction::AShr:
2819 case Instruction::And:
2820 case Instruction::Or:
2821 case Instruction::Xor: {
2822 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2823 ReuseShuffleIndicies);
2824 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
2825
2826 // Sort operands of the instructions so that each side is more likely to
2827 // have the same opcode.
2828 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
2829 ValueList Left, Right;
2830 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
2831 TE->setOperand(0, Left);
2832 TE->setOperand(1, Right);
2833 buildTree_rec(Left, Depth + 1, {TE, 0});
2834 buildTree_rec(Right, Depth + 1, {TE, 1});
2835 return;
2836 }
2837
2838 TE->setOperandsInOrder();
2839 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
2840 ValueList Operands;
2841 // Prepare the operand vector.
2842 for (Value *j : VL)
2843 Operands.push_back(cast<Instruction>(j)->getOperand(i));
2844
2845 buildTree_rec(Operands, Depth + 1, {TE, i});
2846 }
2847 return;
2848 }
2849 case Instruction::GetElementPtr: {
2850 // We don't combine GEPs with complicated (nested) indexing.
2851 for (Value *V : VL) {
2852 if (cast<Instruction>(V)->getNumOperands() != 2) {
2853 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
2854 BS.cancelScheduling(VL, VL0);
2855 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2856 ReuseShuffleIndicies);
2857 return;
2858 }
2859 }
2860
2861 // We can't combine several GEPs into one vector if they operate on
2862 // different types.
2863 Type *Ty0 = VL0->getOperand(0)->getType();
2864 for (Value *V : VL) {
2865 Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
2866 if (Ty0 != CurTy) {
2867 LLVM_DEBUG(dbgs()
2868 << "SLP: not-vectorizable GEP (different types).\n");
2869 BS.cancelScheduling(VL, VL0);
2870 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2871 ReuseShuffleIndicies);
2872 return;
2873 }
2874 }
2875
2876 // We don't combine GEPs with non-constant indexes.
2877 Type *Ty1 = VL0->getOperand(1)->getType();
2878 for (Value *V : VL) {
2879 auto Op = cast<Instruction>(V)->getOperand(1);
2880 if (!isa<ConstantInt>(Op) ||
2881 (Op->getType() != Ty1 &&
2882 Op->getType()->getScalarSizeInBits() >
2883 DL->getIndexSizeInBits(
2884 V->getType()->getPointerAddressSpace()))) {
2885 LLVM_DEBUG(dbgs()
2886 << "SLP: not-vectorizable GEP (non-constant indexes).\n");
2887 BS.cancelScheduling(VL, VL0);
2888 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2889 ReuseShuffleIndicies);
2890 return;
2891 }
2892 }
2893
2894 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2895 ReuseShuffleIndicies);
2896 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
2897 TE->setOperandsInOrder();
2898 for (unsigned i = 0, e = 2; i < e; ++i) {
2899 ValueList Operands;
2900 // Prepare the operand vector.
2901 for (Value *V : VL)
2902 Operands.push_back(cast<Instruction>(V)->getOperand(i));
2903
2904 buildTree_rec(Operands, Depth + 1, {TE, i});
2905 }
2906 return;
2907 }
2908 case Instruction::Store: {
2909 // Check if the stores are consecutive or if we need to swizzle them.
2910 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
2911 // Make sure all stores in the bundle are simple - we can't vectorize
2912 // atomic or volatile stores.
2913 SmallVector<Value *, 4> PointerOps(VL.size());
2914 ValueList Operands(VL.size());
2915 auto POIter = PointerOps.begin();
2916 auto OIter = Operands.begin();
2917 for (Value *V : VL) {
2918 auto *SI = cast<StoreInst>(V);
2919 if (!SI->isSimple()) {
2920 BS.cancelScheduling(VL, VL0);
2921 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2922 ReuseShuffleIndicies);
2923 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
2924 return;
2925 }
2926 *POIter = SI->getPointerOperand();
2927 *OIter = SI->getValueOperand();
2928 ++POIter;
2929 ++OIter;
2930 }
2931
2932 OrdersType CurrentOrder;
2933 // Check the order of pointer operands.
2934 if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
2935 Value *Ptr0;
2936 Value *PtrN;
2937 if (CurrentOrder.empty()) {
2938 Ptr0 = PointerOps.front();
2939 PtrN = PointerOps.back();
2940 } else {
2941 Ptr0 = PointerOps[CurrentOrder.front()];
2942 PtrN = PointerOps[CurrentOrder.back()];
2943 }
2944 const SCEV *Scev0 = SE->getSCEV(Ptr0);
2945 const SCEV *ScevN = SE->getSCEV(PtrN);
2946 const auto *Diff =
2947 dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
2948 uint64_t Size = DL->getTypeAllocSize(ScalarTy);
2949 // Check that the sorted pointer operands are consecutive.
2950 if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
2951 if (CurrentOrder.empty()) {
2952 // Original stores are consecutive and do not require reordering.
2953 ++NumOpsWantToKeepOriginalOrder;
2954 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
2955 UserTreeIdx, ReuseShuffleIndicies);
2956 TE->setOperandsInOrder();
2957 buildTree_rec(Operands, Depth + 1, {TE, 0});
2958 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
2959 } else {
2960 // Need to reorder.
2961 auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
2962 ++(I->getSecond());
2963 TreeEntry *TE =
2964 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2965 ReuseShuffleIndicies, I->getFirst());
2966 TE->setOperandsInOrder();
2967 buildTree_rec(Operands, Depth + 1, {TE, 0});
2968 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
2969 }
2970 return;
2971 }
2972 }
2973
2974 BS.cancelScheduling(VL, VL0);
2975 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2976 ReuseShuffleIndicies);
2977 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
2978 return;
2979 }
2980 case Instruction::Call: {
2981 // Check if the calls are all to the same vectorizable intrinsic.
2982 CallInst *CI = cast<CallInst>(VL0);
2983 // Check if this is an Intrinsic call or something that can be
2984 // represented by an intrinsic call
2985 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2986 if (!isTriviallyVectorizable(ID)) {
2987 BS.cancelScheduling(VL, VL0);
2988 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2989 ReuseShuffleIndicies);
2990 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
2991 return;
2992 }
2993 Function *Int = CI->getCalledFunction();
2994 unsigned NumArgs = CI->getNumArgOperands();
2995 SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
2996 for (unsigned j = 0; j != NumArgs; ++j)
2997 if (hasVectorInstrinsicScalarOpd(ID, j))
2998 ScalarArgs[j] = CI->getArgOperand(j);
2999 for (Value *V : VL) {
3000 CallInst *CI2 = dyn_cast<CallInst>(V);
3001 if (!CI2 || CI2->getCalledFunction() != Int ||
3002 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
3003 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
3004 BS.cancelScheduling(VL, VL0);
3005 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3006 ReuseShuffleIndicies);
3007 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
3008 << "\n");
3009 return;
3010 }
3011 // Some intrinsics have scalar arguments, and these must be the same in
3012 // order for the calls to be vectorized.
3013 for (unsigned j = 0; j != NumArgs; ++j) {
3014 if (hasVectorInstrinsicScalarOpd(ID, j)) {
3015 Value *A1J = CI2->getArgOperand(j);
3016 if (ScalarArgs[j] != A1J) {
3017 BS.cancelScheduling(VL, VL0);
3018 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3019 ReuseShuffleIndicies);
3020 LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
3021 << " argument " << ScalarArgs[j] << "!=" << A1J
3022 << "\n");
3023 return;
3024 }
3025 }
3026 }
3027 // Verify that the bundle operands are identical between the two calls.
3028 if (CI->hasOperandBundles() &&
3029 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
3030 CI->op_begin() + CI->getBundleOperandsEndIndex(),
3031 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
3032 BS.cancelScheduling(VL, VL0);
3033 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3034 ReuseShuffleIndicies);
3035 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
3036 << *CI << "!=" << *V << '\n');
3037 return;
3038 }
3039 }
3040
3041 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3042 ReuseShuffleIndicies);
3043 TE->setOperandsInOrder();
3044 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
3045 ValueList Operands;
3046 // Prepare the operand vector.
3047 for (Value *V : VL) {
3048 auto *CI2 = cast<CallInst>(V);
3049 Operands.push_back(CI2->getArgOperand(i));
3050 }
3051 buildTree_rec(Operands, Depth + 1, {TE, i});
3052 }
3053 return;
3054 }
3055 case Instruction::ShuffleVector: {
3056 // If this is not an alternate sequence of opcode like add-sub
3057 // then do not vectorize this instruction.
3058 if (!S.isAltShuffle()) {
3059 BS.cancelScheduling(VL, VL0);
3060 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3061 ReuseShuffleIndicies);
3062 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
3063 return;
3064 }
3065 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3066 ReuseShuffleIndicies);
3067 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
3068
3069 // Reorder operands if reordering would enable vectorization.
3070 if (isa<BinaryOperator>(VL0)) {
3071 ValueList Left, Right;
3072 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
3073 TE->setOperand(0, Left);
3074 TE->setOperand(1, Right);
3075 buildTree_rec(Left, Depth + 1, {TE, 0});
3076 buildTree_rec(Right, Depth + 1, {TE, 1});
3077 return;
3078 }
3079
3080 TE->setOperandsInOrder();
3081 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
3082 ValueList Operands;
3083 // Prepare the operand vector.
3084 for (Value *V : VL)
3085 Operands.push_back(cast<Instruction>(V)->getOperand(i));
3086
3087 buildTree_rec(Operands, Depth + 1, {TE, i});
3088 }
3089 return;
3090 }
3091 default:
3092 BS.cancelScheduling(VL, VL0);
3093 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3094 ReuseShuffleIndicies);
3095 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
3096 return;
3097 }
3098}
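A minimal stand-alone sketch of the duplicate-scalar handling above (source lines 2508-2529), using plain std containers instead of the LLVM ADTs; all names are illustrative and not part of SLPVectorizer.cpp:

// Bundle with a repeated scalar: {a, b, a, c}. Lane 2 must reuse unique
// value 0, so ReuseShuffleIndices becomes {0, 1, 0, 2}.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> VL = {"a", "b", "a", "c"};
  std::vector<unsigned> ReuseShuffleIndices;
  std::vector<std::string> UniqueValues;
  std::unordered_map<std::string, unsigned> UniquePositions;
  for (const std::string &V : VL) {
    // Record the position of the first occurrence of V.
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.push_back(Res.first->second);
    if (Res.second)
      UniqueValues.push_back(V);
  }
  if (UniqueValues.size() == VL.size())
    ReuseShuffleIndices.clear(); // No duplicates: no reuse shuffle needed.
  for (unsigned Idx : ReuseShuffleIndices)
    std::printf("%u ", Idx); // Prints "0 1 0 2".
  std::printf("\n");
  return 0;
}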
3099
3100unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
3101 unsigned N = 1;
3102 Type *EltTy = T;
3103
3104 while (isa<CompositeType>(EltTy)) {
3105 if (auto *ST = dyn_cast<StructType>(EltTy)) {
3106 // Check that struct is homogeneous.
3107 for (const auto *Ty : ST->elements())
3108 if (Ty != *ST->element_begin())
3109 return 0;
3110 N *= ST->getNumElements();
3111 EltTy = *ST->element_begin();
3112 } else {
3113 auto *SeqT = cast<SequentialType>(EltTy);
3114 N *= SeqT->getNumElements();
3115 EltTy = SeqT->getElementType();
3116 }
3117 }
3118
3119 if (!isValidElementType(EltTy))
3120 return 0;
3121 uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
3122 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
3123 return 0;
3124 return N;
3125}
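A rough illustration of the flattening performed by canMapToVector, assuming a homogeneous aggregate such as [2 x {float, float}]; the toy type below is hypothetical and models only the element counting, not the register-size checks:

// Toy model: a homogeneous aggregate [2 x {float, float}] flattens to
// N = 4 scalar elements, mirroring the struct/array walk above.
#include <cstdio>
#include <vector>

struct ToyType {
  bool IsScalar;
  std::vector<const ToyType *> Elements; // Empty for scalars.
};

unsigned countFlattenedElements(const ToyType &T) {
  if (T.IsScalar)
    return 1;
  if (T.Elements.empty())
    return 0;
  for (const ToyType *E : T.Elements)
    if (E != T.Elements.front()) // Reject non-homogeneous aggregates.
      return 0;
  return T.Elements.size() * countFlattenedElements(*T.Elements.front());
}

int main() {
  ToyType Float{true, {}};
  ToyType Pair{false, {&Float, &Float}}; // {float, float}
  ToyType Arr{false, {&Pair, &Pair}};    // [2 x {float, float}]
  std::printf("N = %u\n", countFlattenedElements(Arr)); // Prints "N = 4".
  return 0;
}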
3126
3127bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3128 SmallVectorImpl<unsigned> &CurrentOrder) const {
3129 Instruction *E0 = cast<Instruction>(OpValue);
3130 assert(E0->getOpcode() == Instruction::ExtractElement ||
3131 E0->getOpcode() == Instruction::ExtractValue);
3132 assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
3133 // Check if all of the extracts come from the same vector and from the
3134 // correct offset.
3135 Value *Vec = E0->getOperand(0);
3136
3137 CurrentOrder.clear();
3138
3139 // We have to extract from a vector/aggregate with the same number of elements.
3140 unsigned NElts;
3141 if (E0->getOpcode() == Instruction::ExtractValue) {
3142 const DataLayout &DL = E0->getModule()->getDataLayout();
3143 NElts = canMapToVector(Vec->getType(), DL);
3144 if (!NElts)
3145 return false;
3146 // Check if load can be rewritten as load of vector.
3147 LoadInst *LI = dyn_cast<LoadInst>(Vec);
3148 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
3149 return false;
3150 } else {
3151 NElts = Vec->getType()->getVectorNumElements();
3152 }
3153
3154 if (NElts != VL.size())
3155 return false;
3156
3157 // Check that all of the indices extract from the correct offset.
3158 bool ShouldKeepOrder = true;
3159 unsigned E = VL.size();
3160 // Assign to all items the initial value E + 1 so we can check if the extract
3161 // instruction index was used already.
3162 // Also, later we can check that all the indices are used and we have a
3163 // consecutive access in the extract instructions, by checking that no
3164 // element of CurrentOrder still has value E + 1.
3165 CurrentOrder.assign(E, E + 1);
3166 unsigned I = 0;
3167 for (; I < E; ++I) {
3168 auto *Inst = cast<Instruction>(VL[I]);
3169 if (Inst->getOperand(0) != Vec)
3170 break;
3171 Optional<unsigned> Idx = getExtractIndex(Inst);
3172 if (!Idx)
3173 break;
3174 const unsigned ExtIdx = *Idx;
3175 if (ExtIdx != I) {
3176 if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
3177 break;
3178 ShouldKeepOrder = false;
3179 CurrentOrder[ExtIdx] = I;
3180 } else {
3181 if (CurrentOrder[I] != E + 1)
3182 break;
3183 CurrentOrder[I] = I;
3184 }
3185 }
3186 if (I < E) {
3187 CurrentOrder.clear();
3188 return false;
3189 }
3190
3191 return ShouldKeepOrder;
3192}
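To make the CurrentOrder bookkeeping concrete, here is a stand-alone sketch over raw extract indices (no llvm::Instruction involved); the input {1, 0, 3, 2} is accepted but flagged as needing a reorder:

// Stand-alone sketch of the index bookkeeping in canReuseExtract above.
// E + 1 marks "slot not used yet"; hitting a slot twice aborts.
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> ExtIdxs = {1, 0, 3, 2}; // Permuted but complete.
  unsigned E = ExtIdxs.size();
  std::vector<unsigned> CurrentOrder(E, E + 1);
  bool ShouldKeepOrder = true;
  unsigned I = 0;
  for (; I < E; ++I) {
    unsigned ExtIdx = ExtIdxs[I];
    if (ExtIdx != I) {
      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
        break; // Out of range or duplicate index: give up.
      ShouldKeepOrder = false;
      CurrentOrder[ExtIdx] = I;
    } else {
      if (CurrentOrder[I] != E + 1)
        break;
      CurrentOrder[I] = I;
    }
  }
  if (I < E) {
    std::printf("not reusable\n");
    return 0;
  }
  std::printf("keep order: %s, order:", ShouldKeepOrder ? "yes" : "no");
  for (unsigned Idx : CurrentOrder)
    std::printf(" %u", Idx); // Prints " 1 0 3 2".
  std::printf("\n");
  return 0;
}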
3193
3194bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
3195 return I->hasOneUse() ||
3196 std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
3197 return ScalarToTreeEntry.count(U) > 0;
3198 });
3199}
3200
3201int BoUpSLP::getEntryCost(TreeEntry *E) {
3202 ArrayRef<Value*> VL = E->Scalars;
3203
3204 Type *ScalarTy = VL[0]->getType();
3205 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
3206 ScalarTy = SI->getValueOperand()->getType();
3207 else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
3208 ScalarTy = CI->getOperand(0)->getType();
3209 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
3210
3211 // If we have computed a smaller type for the expression, update VecTy so
3212 // that the costs will be accurate.
3213 if (MinBWs.count(VL[0]))
3214 VecTy = VectorType::get(
3215 IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
3216
3217 unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
3218 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
3219 int ReuseShuffleCost = 0;
3220 if (NeedToShuffleReuses) {
3221 ReuseShuffleCost =
3222 TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
3223 }
3224 if (E->State == TreeEntry::NeedToGather) {
3225 if (allConstant(VL))
3226 return 0;
3227 if (isSplat(VL)) {
3228 return ReuseShuffleCost +
3229 TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
3230 }
3231 if (E->getOpcode() == Instruction::ExtractElement &&
3232 allSameType(VL) && allSameBlock(VL)) {
3233 Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
3234 if (ShuffleKind.hasValue()) {
3235 int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
3236 for (auto *V : VL) {
3237 // If all users of instruction are going to be vectorized and this
3238 // instruction itself is not going to be vectorized, consider this
3239 // instruction as dead and remove its cost from the final cost of the
3240 // vectorized tree.
3241 if (areAllUsersVectorized(cast<Instruction>(V)) &&
3242 !ScalarToTreeEntry.count(V)) {
3243 auto *IO = cast<ConstantInt>(
3244 cast<ExtractElementInst>(V)->getIndexOperand());
3245 Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
3246 IO->getZExtValue());
3247 }
3248 }
3249 return ReuseShuffleCost + Cost;
3250 }
3251 }
3252 return ReuseShuffleCost + getGatherCost(VL);
3253 }
3254 assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
3255 Instruction *VL0 = E->getMainOp();
3256 unsigned ShuffleOrOp =
3257 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
3258 switch (ShuffleOrOp) {
3259 case Instruction::PHI:
3260 return 0;
3261
3262 case Instruction::ExtractValue:
3263 case Instruction::ExtractElement:
3264 if (NeedToShuffleReuses) {
3265 unsigned Idx = 0;
3266 for (unsigned I : E->ReuseShuffleIndices) {
3267 if (ShuffleOrOp == Instruction::ExtractElement) {
3268 auto *IO = cast<ConstantInt>(
3269 cast<ExtractElementInst>(VL[I])->getIndexOperand());
3270 Idx = IO->getZExtValue();
3271 ReuseShuffleCost -= TTI->getVectorInstrCost(
3272 Instruction::ExtractElement, VecTy, Idx);
3273 } else {
3274 ReuseShuffleCost -= TTI->getVectorInstrCost(
3275 Instruction::ExtractElement, VecTy, Idx);
3276 ++Idx;
3277 }
3278 }
3279 Idx = ReuseShuffleNumbers;
3280 for (Value *V : VL) {
3281 if (ShuffleOrOp == Instruction::ExtractElement) {
3282 auto *IO = cast<ConstantInt>(
3283 cast<ExtractElementInst>(V)->getIndexOperand());
3284 Idx = IO->getZExtValue();
3285 } else {
3286 --Idx;
3287 }
3288 ReuseShuffleCost +=
3289 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
3290 }
3291 }
3292 if (E->State == TreeEntry::Vectorize) {
3293 int DeadCost = ReuseShuffleCost;
3294 if (!E->ReorderIndices.empty()) {
3295 // TODO: Merge this shuffle with the ReuseShuffleCost.
3296 DeadCost += TTI->getShuffleCost(
3297 TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
3298 }
3299 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3300 Instruction *E = cast<Instruction>(VL[i]);
3301 // If all users are going to be vectorized, instruction can be
3302 // considered as dead.
3303 // Likewise, if it has only one user, it will be vectorized for sure.
3304 if (areAllUsersVectorized(E)) {
3305 // Take credit for instruction that will become dead.
3306 if (E->hasOneUse()) {
3307 Instruction *Ext = E->user_back();
3308 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
3309 all_of(Ext->users(),
3310 [](User *U) { return isa<GetElementPtrInst>(U); })) {
3311 // Use getExtractWithExtendCost() to calculate the cost of
3312 // extractelement/ext pair.
3313 DeadCost -= TTI->getExtractWithExtendCost(
3314 Ext->getOpcode(), Ext->getType(), VecTy, i);
3315 // Add back the cost of s|zext which is subtracted separately.
3316 DeadCost += TTI->getCastInstrCost(
3317 Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
3318 continue;
3319 }
3320 }
3321 DeadCost -=
3322 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
3323 }
3324 }
3325 return DeadCost;
3326 }
3327 return ReuseShuffleCost + getGatherCost(VL);
3328
3329 case Instruction::ZExt:
3330 case Instruction::SExt:
3331 case Instruction::FPToUI:
3332 case Instruction::FPToSI:
3333 case Instruction::FPExt:
3334 case Instruction::PtrToInt:
3335 case Instruction::IntToPtr:
3336 case Instruction::SIToFP:
3337 case Instruction::UIToFP:
3338 case Instruction::Trunc:
3339 case Instruction::FPTrunc:
3340 case Instruction::BitCast: {
3341 Type *SrcTy = VL0->getOperand(0)->getType();
3342 int ScalarEltCost =
3343 TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0);
3344 if (NeedToShuffleReuses) {
3345 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3346 }
3347
3348 // Calculate the cost of this instruction.
3349 int ScalarCost = VL.size() * ScalarEltCost;
3350
3351 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
3352 int VecCost = 0;
3353 // Check if the values are candidates to demote.
3354 if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
3355 VecCost = ReuseShuffleCost +
3356 TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
3357 }
3358 return VecCost - ScalarCost;
3359 }
3360 case Instruction::FCmp:
3361 case Instruction::ICmp:
3362 case Instruction::Select: {
3363 // Calculate the cost of this instruction.
3364 int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
3365 Builder.getInt1Ty(), VL0);
3366 if (NeedToShuffleReuses) {
3367 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3368 }
3369 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
3370 int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
3371 int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0);
3372 return ReuseShuffleCost + VecCost - ScalarCost;
3373 }
3374 case Instruction::FNeg:
3375 case Instruction::Add:
3376 case Instruction::FAdd:
3377 case Instruction::Sub:
3378 case Instruction::FSub:
3379 case Instruction::Mul:
3380 case Instruction::FMul:
3381 case Instruction::UDiv:
3382 case Instruction::SDiv:
3383 case Instruction::FDiv:
3384 case Instruction::URem:
3385 case Instruction::SRem:
3386 case Instruction::FRem:
3387 case Instruction::Shl:
3388 case Instruction::LShr:
3389 case Instruction::AShr:
3390 case Instruction::And:
3391 case Instruction::Or:
3392 case Instruction::Xor: {
3393 // Certain instructions can be cheaper to vectorize if they have a
3394 // constant second vector operand.
3395 TargetTransformInfo::OperandValueKind Op1VK =
3396 TargetTransformInfo::OK_AnyValue;
3397 TargetTransformInfo::OperandValueKind Op2VK =
3398 TargetTransformInfo::OK_UniformConstantValue;
3399 TargetTransformInfo::OperandValueProperties Op1VP =
3400 TargetTransformInfo::OP_None;
3401 TargetTransformInfo::OperandValueProperties Op2VP =
3402 TargetTransformInfo::OP_PowerOf2;
3403
3404 // If all operands are exactly the same ConstantInt then set the
3405 // operand kind to OK_UniformConstantValue.
3406 // If instead not all operands are constants, then set the operand kind
3407 // to OK_AnyValue. If all operands are constants but not the same,
3408 // then set the operand kind to OK_NonUniformConstantValue.
3409 ConstantInt *CInt0 = nullptr;
3410 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3411 const Instruction *I = cast<Instruction>(VL[i]);
3412 unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
3413 ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
3414 if (!CInt) {
3415 Op2VK = TargetTransformInfo::OK_AnyValue;
3416 Op2VP = TargetTransformInfo::OP_None;
3417 break;
3418 }
3419 if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
3420 !CInt->getValue().isPowerOf2())
3421 Op2VP = TargetTransformInfo::OP_None;
3422 if (i == 0) {
3423 CInt0 = CInt;
3424 continue;
3425 }
3426 if (CInt0 != CInt)
3427 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
3428 }
3429
3430 SmallVector<const Value *, 4> Operands(VL0->operand_values());
3431 int ScalarEltCost = TTI->getArithmeticInstrCost(
3432 E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
3433 if (NeedToShuffleReuses) {
3434 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3435 }
3436 int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
3437 int VecCost = TTI->getArithmeticInstrCost(
3438 E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
3439 return ReuseShuffleCost + VecCost - ScalarCost;
3440 }
3441 case Instruction::GetElementPtr: {
3442 TargetTransformInfo::OperandValueKind Op1VK =
3443 TargetTransformInfo::OK_AnyValue;
3444 TargetTransformInfo::OperandValueKind Op2VK =
3445 TargetTransformInfo::OK_UniformConstantValue;
3446
3447 int ScalarEltCost =
3448 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
3449 if (NeedToShuffleReuses) {
3450 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3451 }
3452 int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
3453 int VecCost =
3454 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
3455 return ReuseShuffleCost + VecCost - ScalarCost;
3456 }
3457 case Instruction::Load: {
3458 // Cost of wide load - cost of scalar loads.
3459 MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment());
3460 int ScalarEltCost =
3461 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
3462 if (NeedToShuffleReuses) {
3463 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3464 }
3465 int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
3466 int VecLdCost =
3467 TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
3468 if (!E->ReorderIndices.empty()) {
3469 // TODO: Merge this shuffle with the ReuseShuffleCost.
3470 VecLdCost += TTI->getShuffleCost(
3471 TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
3472 }
3473 return ReuseShuffleCost + VecLdCost - ScalarLdCost;
3474 }
3475 case Instruction::Store: {
3476 // We know that we can merge the stores. Calculate the cost.
3477 bool IsReorder = !E->ReorderIndices.empty();
3478 auto *SI =
3479 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
3480 MaybeAlign Alignment(SI->getAlignment());
3481 int ScalarEltCost =
3482 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
3483 if (NeedToShuffleReuses)
3484 ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3485 int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
3486 int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
3487 VecTy, Alignment, 0, VL0);
3488 if (IsReorder) {
3489 // TODO: Merge this shuffle with the ReuseShuffleCost.
3490 VecStCost += TTI->getShuffleCost(
3491 TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
3492 }
3493 return ReuseShuffleCost + VecStCost - ScalarStCost;
3494 }
3495 case Instruction::Call: {
3496 CallInst *CI = cast<CallInst>(VL0);
3497 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3498
3499 // Calculate the cost of the scalar and vector calls.
3500 SmallVector<Type *, 4> ScalarTys;
3501 for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
3502 ScalarTys.push_back(CI->getArgOperand(op)->getType());
3503
3504 FastMathFlags FMF;
3505 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3506 FMF = FPMO->getFastMathFlags();
3507
3508 int ScalarEltCost =
3509 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
3510 if (NeedToShuffleReuses) {
3511 ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3512 }
3513 int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
3514
3515 SmallVector<Value *, 4> Args(CI->arg_operands());
3516 int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
3517 VecTy->getNumElements());
3518
3519 LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
3520 << " (" << VecCallCost << "-" << ScalarCallCost << ")"
3521 << " for " << *CI << "\n");
3522
3523 return ReuseShuffleCost + VecCallCost - ScalarCallCost;
3524 }
3525 case Instruction::ShuffleVector: {
3526 assert(E->isAltShuffle() &&
3527 ((Instruction::isBinaryOp(E->getOpcode()) &&
3528 Instruction::isBinaryOp(E->getAltOpcode())) ||
3529 (Instruction::isCast(E->getOpcode()) &&
3530 Instruction::isCast(E->getAltOpcode()))) &&
3531 "Invalid Shuffle Vector Operand");
3532 int ScalarCost = 0;
3533 if (NeedToShuffleReuses) {
3534 for (unsigned Idx : E->ReuseShuffleIndices) {
3535 Instruction *I = cast<Instruction>(VL[Idx]);
3536 ReuseShuffleCost -= TTI->getInstructionCost(
3537 I, TargetTransformInfo::TCK_RecipThroughput);
3538 }
3539 for (Value *V : VL) {
3540 Instruction *I = cast<Instruction>(V);
3541 ReuseShuffleCost += TTI->getInstructionCost(
3542 I, TargetTransformInfo::TCK_RecipThroughput);
3543 }
3544 }
3545 for (Value *V : VL) {
3546 Instruction *I = cast<Instruction>(V);
3547 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
3548 ScalarCost += TTI->getInstructionCost(
3549 I, TargetTransformInfo::TCK_RecipThroughput);
3550 }
3551 // VecCost is equal to sum of the cost of creating 2 vectors
3552 // and the cost of creating shuffle.
3553 int VecCost = 0;
3554 if (Instruction::isBinaryOp(E->getOpcode())) {
3555 VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy);
3556 VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy);
3557 } else {
3558 Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
3559 Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
3560 VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
3561 VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
3562 VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty);
3563 VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty);
3564 }
3565 VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
3566 return ReuseShuffleCost + VecCost - ScalarCost;
3567 }
3568 default:
3569 llvm_unreachable("Unknown instruction");
3570 }
3571}
3572
3573bool BoUpSLP::isFullyVectorizableTinyTree() const {
3574 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
3575 << VectorizableTree.size() << " is fully vectorizable .\n");
3576
3577 // We only handle trees of heights 1 and 2.
3578 if (VectorizableTree.size() == 1 &&
3579 VectorizableTree[0]->State == TreeEntry::Vectorize)
3580 return true;
3581
3582 if (VectorizableTree.size() != 2)
3583 return false;
3584
3585 // Handle splat and all-constants stores.
3586 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
3587 (allConstant(VectorizableTree[1]->Scalars) ||
3588 isSplat(VectorizableTree[1]->Scalars)))
3589 return true;
3590
3591 // Gathering cost would be too much for tiny trees.
3592 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
3593 VectorizableTree[1]->State == TreeEntry::NeedToGather)
3594 return false;
3595
3596 return true;
3597}
3598
3599bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
3600 if (RdxOpcode != Instruction::Or)
3601 return false;
3602
3603 unsigned NumElts = VectorizableTree[0]->Scalars.size();
3604 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
3605
3606 // Look past the reduction to find a source value. Arbitrarily follow the
3607 // path through operand 0 of any 'or'. Also, peek through optional
3608 // shift-left-by-constant.
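// As an illustration (assuming i8 elements), a matching reduction operand has
// the shape:
//   %z = zext i8 %byte to i64
//   %s = shl i64 %z, <constant>
//   %o = or i64 %s, ...
// Walking operand 0 through the 'or'/'shl' chain ends at the zext'ed load.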
3609 Value *ZextLoad = FirstReduced;
3610 while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
3611 match(ZextLoad, m_Shl(m_Value(), m_Constant())))
3612 ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
3613
3614 // Check if the input to the reduction is an extended load.
3615 Value *LoadPtr;
3616 if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
3617 return false;
3618
3619 // Require that the total load bit width is a legal integer type.
3620 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
3621 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
3622 Type *SrcTy = LoadPtr->getType()->getPointerElementType();
3623 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
3624 LLVMContext &Context = FirstReduced->getContext();
3625 if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
3626 return false;
3627
3628 // Everything matched - assume that we can fold the whole sequence using
3629 // load combining.
3630 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
3631 << *(cast<Instruction>(FirstReduced)) << "\n");
3632
3633 return true;
3634}
3635
3636bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
3637 // We can vectorize the tree if its size is greater than or equal to the
3638 // minimum size specified by the MinTreeSize command line option.
3639 if (VectorizableTree.size() >= MinTreeSize)
3640 return false;
3641
3642 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
3643 // can vectorize it if we can prove it fully vectorizable.
3644 if (isFullyVectorizableTinyTree())
3645 return false;
3646
3647 assert(VectorizableTree.empty()
3648 ? ExternalUses.empty()
3649 : true && "We shouldn't have any external users");
3650
3651 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
3652 // vectorizable.
3653 return true;
3654}
3655
3656int BoUpSLP::getSpillCost() const {
3657 // Walk from the bottom of the tree to the top, tracking which values are
3658 // live. When we see a call instruction that is not part of our tree,
3659 // query TTI to see if there is a cost to keeping values live over it
3660 // (for example, if spills and fills are required).
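// For instance, if a non-tree call sits between two bundled instructions while
// three tree values are live, the loop below charges
// getCostOfKeepingLiveOverCall() for three BundleWidth-wide vector types.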
3661 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
3662 int Cost = 0;
3663
3664 SmallPtrSet<Instruction*, 4> LiveValues;
3665 Instruction *PrevInst = nullptr;
3666
3667 for (const auto &TEPtr : VectorizableTree) {
3668 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
3669 if (!Inst)
3670 continue;
3671
3672 if (!PrevInst) {
3673 PrevInst = Inst;
3674 continue;
3675 }
3676
3677 // Update LiveValues.
3678 LiveValues.erase(PrevInst);
3679 for (auto &J : PrevInst->operands()) {
3680 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
3681 LiveValues.insert(cast<Instruction>(&*J));
3682 }
3683
3684 LLVM_DEBUG({
3685 dbgs() << "SLP: #LV: " << LiveValues.size();
3686 for (auto *X : LiveValues)
3687 dbgs() << " " << X->getName();
3688 dbgs() << ", Looking at ";
3689 Inst->dump();
3690 });
3691
3692 // Now find the sequence of instructions between PrevInst and Inst.
3693 unsigned NumCalls = 0;
3694 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
3695 PrevInstIt =
3696 PrevInst->getIterator().getReverse();
3697 while (InstIt != PrevInstIt) {
3698 if (PrevInstIt == PrevInst->getParent()->rend()) {
3699 PrevInstIt = Inst->getParent()->rbegin();
3700 continue;
3701 }
3702
3703 // Debug information does not impact spill cost.
3704 if ((isa<CallInst>(&*PrevInstIt) &&
3705 !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
3706 &*PrevInstIt != PrevInst)
3707 NumCalls++;
3708
3709 ++PrevInstIt;
3710 }
3711
3712 if (NumCalls) {
3713 SmallVector<Type*, 4> V;
3714 for (auto *II : LiveValues)
3715 V.push_back(VectorType::get(II->getType(), BundleWidth));
3716 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
3717 }
3718
3719 PrevInst = Inst;
3720 }
3721
3722 return Cost;
3723}
3724
3725int BoUpSLP::getTreeCost() {
3726 int Cost = 0;
3727 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
3728 << VectorizableTree.size() << ".\n");
3729
3730 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
3731
3732 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
3733 TreeEntry &TE = *VectorizableTree[I].get();
3734
3735 // We create duplicate tree entries for gather sequences that have multiple
3736 // uses. However, we should not compute the cost of duplicate sequences.
3737 // For example, if we have a build vector (i.e., insertelement sequence)
3738 // that is used by more than one vector instruction, we only need to
3739 // compute the cost of the insertelement instructions once. The redundant
3740 // instructions will be eliminated by CSE.
3741 //
3742 // We should consider not creating duplicate tree entries for gather
3743 // sequences, and instead add additional edges to the tree representing
3744 // their uses. Since such an approach results in fewer total entries,
3745 // existing heuristics based on tree size may yield different results.
3746 //
3747 if (TE.State == TreeEntry::NeedToGather &&
3748 std::any_of(std::next(VectorizableTree.begin(), I + 1),
3749 VectorizableTree.end(),
3750 [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
3751 return EntryPtr->State == TreeEntry::NeedToGather &&
3752 EntryPtr->isSame(TE.Scalars);
3753 }))
3754 continue;
3755
3756 int C = getEntryCost(&TE);
3757 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
3758 << " for bundle that starts with " << *TE.Scalars[0]
3759 << ".\n");
3760 Cost += C;
3761 }
3762
3763 SmallPtrSet<Value *, 16> ExtractCostCalculated;
3764 int ExtractCost = 0;
3765 for (ExternalUser &EU : ExternalUses) {
3766 // We only add extract cost once for the same scalar.
3767 if (!ExtractCostCalculated.insert(EU.Scalar).second)
3768 continue;
3769
3770 // Uses by ephemeral values are free (because the ephemeral value will be
3771 // removed prior to code generation, and so the extraction will be
3772 // removed as well).
3773 if (EphValues.count(EU.User))
3774 continue;
3775
3776 // If we plan to rewrite the tree in a smaller type, we will need to sign
3777 // extend the extracted value back to the original type. Here, we account
3778 // for the extract and the added cost of the sign extend if needed.
3779 auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
3780 auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
3781 if (MinBWs.count(ScalarRoot)) {
3782 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
3783 auto Extend =
3784 MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
3785 VecTy = VectorType::get(MinTy, BundleWidth);
3786 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
3787 VecTy, EU.Lane);
3788 } else {
3789 ExtractCost +=
3790 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
3791 }
3792 }
3793
3794 int SpillCost = getSpillCost();
3795 Cost += SpillCost + ExtractCost;
3796
3797 std::string Str;
3798 {
3799 raw_string_ostream OS(Str);
3800 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
3801 << "SLP: Extract Cost = " << ExtractCost << ".\n"
3802 << "SLP: Total Cost = " << Cost << ".\n";
3803 }
3804 LLVM_DEBUG(dbgs() << Str);
3805
3806 if (ViewSLPTree)
3807 ViewGraph(this, "SLP" + F->getName(), false, Str);
3808
3809 return Cost;
3810}
3811
3812int BoUpSLP::getGatherCost(Type *Ty,
3813 const DenseSet<unsigned> &ShuffledIndices) const {
3814 int Cost = 0;
3815 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
3816 if (!ShuffledIndices.count(i))
3817 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
3818 if (!ShuffledIndices.empty())
3819 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
3820 return Cost;
3821}
3822
3823int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
3824 // Find the type of the operands in VL.
3825 Type *ScalarTy = VL[0]->getType();
3826 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
3827 ScalarTy = SI->getValueOperand()->getType();
3828 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
3829 // Find the cost of inserting/extracting values from the vector.
3830 // Check if the same elements are inserted several times and count them as
3831 // shuffle candidates.
3832 DenseSet<unsigned> ShuffledElements;
3833 DenseSet<Value *> UniqueElements;
3834 // Iterate in reverse order to consider insert elements with the high cost.
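  // E.g. for VL = {a, b, a, c} only the earlier duplicate (index 0) lands in
  // ShuffledElements, so the gather is priced as three inserts plus one
  // single-source permute rather than four inserts.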
3835 for (unsigned I = VL.size(); I > 0; --I) {
3836 unsigned Idx = I - 1;
3837 if (!UniqueElements.insert(VL[Idx]).second)
3838 ShuffledElements.insert(Idx);
3839 }
3840 return getGatherCost(VecTy, ShuffledElements);
3841}
3842
3843// Perform operand reordering on the instructions in VL and return the reordered
3844// operands in Left and Right.
3845void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
3846 SmallVectorImpl<Value *> &Left,
3847 SmallVectorImpl<Value *> &Right,
3848 const DataLayout &DL,
3849 ScalarEvolution &SE,
3850 const BoUpSLP &R) {
3851 if (VL.empty())
3852 return;
3853 VLOperands Ops(VL, DL, SE, R);
3854 // Reorder the operands in place.
3855 Ops.reorder();
3856 Left = Ops.getVL(0);
3857 Right = Ops.getVL(1);
3858}
3859
3860void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
3861 // Get the basic block this bundle is in. All instructions in the bundle
3862 // should be in this block.
3863 auto *Front = E->getMainOp();
3864 auto *BB = Front->getParent();
3865 assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()),
3866 [=](Value *V) -> bool {
3867 auto *I = cast<Instruction>(V);
3868 return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
3869 }));
3870
3871 // The last instruction in the bundle in program order.
3872 Instruction *LastInst = nullptr;
3873
3874 // Find the last instruction. The common case should be that BB has been
3875 // scheduled, and the last instruction is VL.back(). So we start with
3876 // VL.back() and iterate over schedule data until we reach the end of the
3877 // bundle. The end of the bundle is marked by null ScheduleData.
3878 if (BlocksSchedules.count(BB)) {
3879 auto *Bundle =
3880 BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
3881 if (Bundle && Bundle->isPartOfBundle())
3882 for (; Bundle; Bundle = Bundle->NextInBundle)
3883 if (Bundle->OpValue == Bundle->Inst)
3884 LastInst = Bundle->Inst;
3885 }
3886
3887 // LastInst can still be null at this point if there's either not an entry
3888 // for BB in BlocksSchedules or there's no ScheduleData available for
3889 // VL.back(). This can be the case if buildTree_rec aborts for various
3890 // reasons (e.g., the maximum recursion depth is reached, the maximum region
3891 // size is reached, etc.). ScheduleData is initialized in the scheduling
3892 // "dry-run".
3893 //
3894 // If this happens, we can still find the last instruction by brute force. We
3895 // iterate forwards from Front (inclusive) until we either see all
3896 // instructions in the bundle or reach the end of the block. If Front is the
3897 // last instruction in program order, LastInst will be set to Front, and we
3898 // will visit all the remaining instructions in the block.
3899 //
3900 // One of the reasons we exit early from buildTree_rec is to place an upper
3901 // bound on compile-time. Thus, taking an additional compile-time hit here is
3902 // not ideal. However, this should be exceedingly rare since it requires that
3903 // we both exit early from buildTree_rec and that the bundle be out-of-order
3904 // (causing us to iterate all the way to the end of the block).
3905 if (!LastInst) {
3906 SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
3907 for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
3908 if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
3909 LastInst = &I;
3910 if (Bundle.empty())
3911 break;
3912 }
3913 }
3914 assert(LastInst && "Failed to find last instruction in bundle");
3915
3916 // Set the insertion point after the last instruction in the bundle. Set the
3917 // debug location to Front.
3918 Builder.SetInsertPoint(BB, ++LastInst->getIterator());
3919 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
3920}
3921
3922Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
3923 Value *Vec = UndefValue::get(Ty);
3924 // Generate the 'InsertElement' instruction.
3925 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
3926 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
3927 if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) {
3928 GatherSeq.insert(Insrt);
3929 CSEBlocks.insert(Insrt->getParent());
3930
3931 // Add to our 'need-to-extract' list.
3932 if (TreeEntry *E = getTreeEntry(VL[i])) {
3933 // Find which lane we need to extract.
3934 int FoundLane = -1;
3935 for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
3936 // Is this the lane of the scalar that we are looking for ?
3937 if (E->Scalars[Lane] == VL[i]) {
3938 FoundLane = Lane;
3939 break;
3940 }
3941 }
3942 assert(FoundLane >= 0 && "Could not find the correct lane");
3943 if (!E->ReuseShuffleIndices.empty()) {
3944 FoundLane =
3945 std::distance(E->ReuseShuffleIndices.begin(),
3946 llvm::find(E->ReuseShuffleIndices, FoundLane));
3947 }
3948 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
3949 }
3950 }
3951 }
3952
3953 return Vec;
3954}
3955
3956Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
3957 InstructionsState S = getSameOpcode(VL);
3958 if (S.getOpcode()) {
3959 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
3960 if (E->isSame(VL)) {
3961 Value *V = vectorizeTree(E);
3962 if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
3963 // We need to get the vectorized value but without shuffle.
3964 if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
3965 V = SV->getOperand(0);
3966 } else {
3967 // Reshuffle to get only unique values.
3968 SmallVector<unsigned, 4> UniqueIdxs;
3969 SmallSet<unsigned, 4> UsedIdxs;
3970 for(unsigned Idx : E->ReuseShuffleIndices)
3971 if (UsedIdxs.insert(Idx).second)
3972 UniqueIdxs.emplace_back(Idx);
3973 V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
3974 UniqueIdxs);
3975 }
3976 }
3977 return V;
3978 }
3979 }
3980 }
3981
3982 Type *ScalarTy = S.OpValue->getType();
3983 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
3984 ScalarTy = SI->getValueOperand()->getType();
3985
3986 // Check that every instruction appears once in this bundle.
3987 SmallVector<unsigned, 4> ReuseShuffleIndicies;
3988 SmallVector<Value *, 4> UniqueValues;
3989 if (VL.size() > 2) {
3990 DenseMap<Value *, unsigned> UniquePositions;
3991 for (Value *V : VL) {
3992 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
3993 ReuseShuffleIndicies.emplace_back(Res.first->second);
3994 if (Res.second || isa<Constant>(V))
3995 UniqueValues.emplace_back(V);
3996 }
3997 // Do not shuffle single element or if number of unique values is not power
3998 // of 2.
3999 if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
4000 !llvm::isPowerOf2_32(UniqueValues.size()))
4001 ReuseShuffleIndicies.clear();
4002 else
4003 VL = UniqueValues;
4004 }
4005 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
4006
4007 Value *V = Gather(VL, VecTy);
4008 if (!ReuseShuffleIndicies.empty()) {
4009 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4010 ReuseShuffleIndicies, "shuffle");
4011 if (auto *I = dyn_cast<Instruction>(V)) {
4012 GatherSeq.insert(I);
4013 CSEBlocks.insert(I->getParent());
4014 }
4015 }
4016 return V;
4017}
4018
4019static void inversePermutation(ArrayRef<unsigned> Indices,
4020 SmallVectorImpl<unsigned> &Mask) {
4021 Mask.clear();
4022 const unsigned E = Indices.size();
4023 Mask.resize(E);
4024 for (unsigned I = 0; I < E; ++I)
4025 Mask[Indices[I]] = I;
4026}
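// For example, inversePermutation({2, 0, 1}, Mask) yields Mask = {1, 2, 0};
// Mask is the permutation that undoes Indices.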
4027
4028Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
4029 IRBuilder<>::InsertPointGuard Guard(Builder);
4030
4031 if (E->VectorizedValue) {
4032 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
4033 return E->VectorizedValue;
4034 }
4035
4036 Instruction *VL0 = E->getMainOp();
4037 Type *ScalarTy = VL0->getType();
4038 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
4039 ScalarTy = SI->getValueOperand()->getType();
4040 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
4041
4042 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
4043
4044 if (E->State == TreeEntry::NeedToGather) {
4045 setInsertPointAfterBundle(E);
4046 auto *V = Gather(E->Scalars, VecTy);
4047 if (NeedToShuffleReuses) {
4048 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4049 E->ReuseShuffleIndices, "shuffle");
4050 if (auto *I = dyn_cast<Instruction>(V)) {
4051 GatherSeq.insert(I);
4052 CSEBlocks.insert(I->getParent());
4053 }
4054 }
4055 E->VectorizedValue = V;
4056 return V;
4057 }
4058
4059 unsigned ShuffleOrOp =
4060 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
4061 switch (ShuffleOrOp) {
4062 case Instruction::PHI: {
4063 auto *PH = cast<PHINode>(VL0);
4064 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
4065 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
4066 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
4067 Value *V = NewPhi;
4068 if (NeedToShuffleReuses) {
4069 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4070 E->ReuseShuffleIndices, "shuffle");
4071 }
4072 E->VectorizedValue = V;
4073
4074 // PHINodes may have multiple entries from the same block. We want to
4075 // visit every block once.
4076 SmallPtrSet<BasicBlock*, 4> VisitedBBs;
4077
4078 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
4079 ValueList Operands;
4080 BasicBlock *IBB = PH->getIncomingBlock(i);
4081
4082 if (!VisitedBBs.insert(IBB).second) {
4083 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
4084 continue;
4085 }
4086
4087 Builder.SetInsertPoint(IBB->getTerminator());
4088 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
4089 Value *Vec = vectorizeTree(E->getOperand(i));
4090 NewPhi->addIncoming(Vec, IBB);
4091 }
4092
4093 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
4094 "Invalid number of incoming values");
4095 return V;
4096 }
4097
4098 case Instruction::ExtractElement: {
4099 if (E->State == TreeEntry::Vectorize) {
4100 Value *V = E->getSingleOperand(0);
4101 if (!E->ReorderIndices.empty()) {
4102 OrdersType Mask;
4103 inversePermutation(E->ReorderIndices, Mask);
4104 Builder.SetInsertPoint(VL0);
4105 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
4106 "reorder_shuffle");
4107 }
4108 if (NeedToShuffleReuses) {
4109 // TODO: Merge this shuffle with the ReorderShuffleMask.
4110 if (E->ReorderIndices.empty())
4111 Builder.SetInsertPoint(VL0);
4112 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4113 E->ReuseShuffleIndices, "shuffle");
4114 }
4115 E->VectorizedValue = V;
4116 return V;
4117 }
4118 setInsertPointAfterBundle(E);
4119 auto *V = Gather(E->Scalars, VecTy);
4120 if (NeedToShuffleReuses) {
4121 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4122 E->ReuseShuffleIndices, "shuffle");
4123 if (auto *I = dyn_cast<Instruction>(V)) {
4124 GatherSeq.insert(I);
4125 CSEBlocks.insert(I->getParent());
4126 }
4127 }
4128 E->VectorizedValue = V;
4129 return V;
4130 }
4131 case Instruction::ExtractValue: {
4132 if (E->State == TreeEntry::Vectorize) {
4133 LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
4134 Builder.SetInsertPoint(LI);
4135 PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
4136 Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
4137 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
4138 Value *NewV = propagateMetadata(V, E->Scalars);
4139 if (!E->ReorderIndices.empty()) {
4140 OrdersType Mask;
4141 inversePermutation(E->ReorderIndices, Mask);
4142 NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
4143 "reorder_shuffle");
4144 }
4145 if (NeedToShuffleReuses) {
4146 // TODO: Merge this shuffle with the ReorderShuffleMask.
4147 NewV = Builder.CreateShuffleVector(
4148 NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
4149 }
4150 E->VectorizedValue = NewV;
4151 return NewV;
4152 }
4153 setInsertPointAfterBundle(E);
4154 auto *V = Gather(E->Scalars, VecTy);
4155 if (NeedToShuffleReuses) {
4156 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4157 E->ReuseShuffleIndices, "shuffle");
4158 if (auto *I = dyn_cast<Instruction>(V)) {
4159 GatherSeq.insert(I);
4160 CSEBlocks.insert(I->getParent());
4161 }
4162 }
4163 E->VectorizedValue = V;
4164 return V;
4165 }
4166 case Instruction::ZExt:
4167 case Instruction::SExt:
4168 case Instruction::FPToUI:
4169 case Instruction::FPToSI:
4170 case Instruction::FPExt:
4171 case Instruction::PtrToInt:
4172 case Instruction::IntToPtr:
4173 case Instruction::SIToFP:
4174 case Instruction::UIToFP:
4175 case Instruction::Trunc:
4176 case Instruction::FPTrunc:
4177 case Instruction::BitCast: {
4178 setInsertPointAfterBundle(E);
4179
4180 Value *InVec = vectorizeTree(E->getOperand(0));
4181
4182 if (E->VectorizedValue) {
4183 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4184 return E->VectorizedValue;
4185 }
4186
4187 auto *CI = cast<CastInst>(VL0);
4188 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
4189 if (NeedToShuffleReuses) {
4190 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4191 E->ReuseShuffleIndices, "shuffle");
4192 }
4193 E->VectorizedValue = V;
4194 ++NumVectorInstructions;
4195 return V;
4196 }
4197 case Instruction::FCmp:
4198 case Instruction::ICmp: {
4199 setInsertPointAfterBundle(E);
4200
4201 Value *L = vectorizeTree(E->getOperand(0));
4202 Value *R = vectorizeTree(E->getOperand(1));
4203
4204 if (E->VectorizedValue) {
4205 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4206 return E->VectorizedValue;
4207 }
4208
4209 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
4210 Value *V;
4211 if (E->getOpcode() == Instruction::FCmp)
4212 V = Builder.CreateFCmp(P0, L, R);
4213 else
4214 V = Builder.CreateICmp(P0, L, R);
4215
4216 propagateIRFlags(V, E->Scalars, VL0);
4217 if (NeedToShuffleReuses) {
4218 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4219 E->ReuseShuffleIndices, "shuffle");
4220 }
4221 E->VectorizedValue = V;
4222 ++NumVectorInstructions;
4223 return V;
4224 }
4225 case Instruction::Select: {
4226 setInsertPointAfterBundle(E);
4227
4228 Value *Cond = vectorizeTree(E->getOperand(0));
4229 Value *True = vectorizeTree(E->getOperand(1));
4230 Value *False = vectorizeTree(E->getOperand(2));
4231
4232 if (E->VectorizedValue) {
4233 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4234 return E->VectorizedValue;
4235 }
4236
4237 Value *V = Builder.CreateSelect(Cond, True, False);
4238 if (NeedToShuffleReuses) {
4239 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4240 E->ReuseShuffleIndices, "shuffle");
4241 }
4242 E->VectorizedValue = V;
4243 ++NumVectorInstructions;
4244 return V;
4245 }
4246 case Instruction::FNeg: {
4247 setInsertPointAfterBundle(E);
4248
4249 Value *Op = vectorizeTree(E->getOperand(0));
4250
4251 if (E->VectorizedValue) {
4252 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4253 return E->VectorizedValue;
4254 }
4255
4256 Value *V = Builder.CreateUnOp(
4257 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
4258 propagateIRFlags(V, E->Scalars, VL0);
4259 if (auto *I = dyn_cast<Instruction>(V))
4260 V = propagateMetadata(I, E->Scalars);
4261
4262 if (NeedToShuffleReuses) {
4263 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4264 E->ReuseShuffleIndices, "shuffle");
4265 }
4266 E->VectorizedValue = V;
4267 ++NumVectorInstructions;
4268
4269 return V;
4270 }
4271 case Instruction::Add:
4272 case Instruction::FAdd:
4273 case Instruction::Sub:
4274 case Instruction::FSub:
4275 case Instruction::Mul:
4276 case Instruction::FMul:
4277 case Instruction::UDiv:
4278 case Instruction::SDiv:
4279 case Instruction::FDiv:
4280 case Instruction::URem:
4281 case Instruction::SRem:
4282 case Instruction::FRem:
4283 case Instruction::Shl:
4284 case Instruction::LShr:
4285 case Instruction::AShr:
4286 case Instruction::And:
4287 case Instruction::Or:
4288 case Instruction::Xor: {
4289 setInsertPointAfterBundle(E);
4290
4291 Value *LHS = vectorizeTree(E->getOperand(0));
4292 Value *RHS = vectorizeTree(E->getOperand(1));
4293
4294 if (E->VectorizedValue) {
4295 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4296 return E->VectorizedValue;
4297 }
4298
4299 Value *V = Builder.CreateBinOp(
4300 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
4301 RHS);
4302 propagateIRFlags(V, E->Scalars, VL0);
4303 if (auto *I = dyn_cast<Instruction>(V))
4304 V = propagateMetadata(I, E->Scalars);
4305
4306 if (NeedToShuffleReuses) {
4307 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4308 E->ReuseShuffleIndices, "shuffle");
4309 }
4310 E->VectorizedValue = V;
4311 ++NumVectorInstructions;
4312
4313 return V;
4314 }
4315 case Instruction::Load: {
4316 // Loads are inserted at the head of the tree because we don't want to
4317 // sink them all the way down past store instructions.
4318 bool IsReorder = E->updateStateIfReorder();
4319 if (IsReorder)
4320 VL0 = E->getMainOp();
4321 setInsertPointAfterBundle(E);
4322
4323 LoadInst *LI = cast<LoadInst>(VL0);
4324 Type *ScalarLoadTy = LI->getType();
4325 unsigned AS = LI->getPointerAddressSpace();
4326
4327 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
4328 VecTy->getPointerTo(AS));
4329
4330 // The pointer operand uses an in-tree scalar so we add the new BitCast to
4331 // ExternalUses list to make sure that an extract will be generated in the
4332 // future.
4333 Value *PO = LI->getPointerOperand();
4334 if (getTreeEntry(PO))
4335 ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
4336
4337 MaybeAlign Alignment = MaybeAlign(LI->getAlignment());
4338 LI = Builder.CreateLoad(VecTy, VecPtr);
4339 if (!Alignment)
4340 Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy));
4341 LI->setAlignment(Alignment);
4342 Value *V = propagateMetadata(LI, E->Scalars);
4343 if (IsReorder) {
4344 OrdersType Mask;
4345 inversePermutation(E->ReorderIndices, Mask);
4346 V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
4347 Mask, "reorder_shuffle");
4348 }
4349 if (NeedToShuffleReuses) {
4350 // TODO: Merge this shuffle with the ReorderShuffleMask.
4351 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4352 E->ReuseShuffleIndices, "shuffle");
4353 }
4354 E->VectorizedValue = V;
4355 ++NumVectorInstructions;
4356 return V;
4357 }
4358 case Instruction::Store: {
4359 bool IsReorder = !E->ReorderIndices.empty();
4360 auto *SI = cast<StoreInst>(
4361 IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
4362 unsigned Alignment = SI->getAlignment();
4363 unsigned AS = SI->getPointerAddressSpace();
4364
4365 setInsertPointAfterBundle(E);
4366
4367 Value *VecValue = vectorizeTree(E->getOperand(0));
4368 if (IsReorder) {
4369 OrdersType Mask;
4370 inversePermutation(E->ReorderIndices, Mask);
4371 VecValue = Builder.CreateShuffleVector(
4372 VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
4373 "reorder_shuffle");
4374 }
4375 Value *ScalarPtr = SI->getPointerOperand();
4376 Value *VecPtr = Builder.CreateBitCast(
4377 ScalarPtr, VecValue->getType()->getPointerTo(AS));
4378 StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
4379
4380 // The pointer operand uses an in-tree scalar, so add the new BitCast to
4381 // ExternalUses to make sure that an extract will be generated in the
4382 // future.
4383 if (getTreeEntry(ScalarPtr))
4384 ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
4385
4386 if (!Alignment)
4387 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
4388
4389 ST->setAlignment(Align(Alignment));
4390 Value *V = propagateMetadata(ST, E->Scalars);
4391 if (NeedToShuffleReuses) {
4392 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4393 E->ReuseShuffleIndices, "shuffle");
4394 }
4395 E->VectorizedValue = V;
4396 ++NumVectorInstructions;
4397 return V;
4398 }
4399 case Instruction::GetElementPtr: {
4400 setInsertPointAfterBundle(E);
4401
4402 Value *Op0 = vectorizeTree(E->getOperand(0));
4403
4404 std::vector<Value *> OpVecs;
4405 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
4406 ++j) {
4407 ValueList &VL = E->getOperand(j);
4408 // Need to cast all elements to the same type before vectorization to
4409 // avoid crash.
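      // E.g. constant indices of mixed widths (i32 and i64) are all rewritten
      // to the pointer's index type so one vector operand can be built.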
4410 Type *VL0Ty = VL0->getOperand(j)->getType();
4411 Type *Ty = llvm::all_of(
4412 VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
4413 ? VL0Ty
4414 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
4415 ->getPointerOperandType()
4416 ->getScalarType());
4417 for (Value *&V : VL) {
4418 auto *CI = cast<ConstantInt>(V);
4419 V = ConstantExpr::getIntegerCast(CI, Ty,
4420 CI->getValue().isSignBitSet());
4421 }
4422 Value *OpVec = vectorizeTree(VL);
4423 OpVecs.push_back(OpVec);
4424 }
4425
4426 Value *V = Builder.CreateGEP(
4427 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
4428 if (Instruction *I = dyn_cast<Instruction>(V))
4429 V = propagateMetadata(I, E->Scalars);
4430
4431 if (NeedToShuffleReuses) {
4432 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4433 E->ReuseShuffleIndices, "shuffle");
4434 }
4435 E->VectorizedValue = V;
4436 ++NumVectorInstructions;
4437
4438 return V;
4439 }
4440 case Instruction::Call: {
4441 CallInst *CI = cast<CallInst>(VL0);
4442 setInsertPointAfterBundle(E);
4443
4444 Intrinsic::ID IID = Intrinsic::not_intrinsic;
4445 if (Function *FI = CI->getCalledFunction())
4446 IID = FI->getIntrinsicID();
4447
4448 Value *ScalarArg = nullptr;
4449 std::vector<Value *> OpVecs;
4450 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
4451 ValueList OpVL;
4452 // Some intrinsics have scalar arguments. This argument should not be
4453 // vectorized.
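        // E.g. the exponent operand of llvm.powi stays scalar; it is passed
        // through unchanged and later re-checked as a possible external use.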
4454 if (hasVectorInstrinsicScalarOpd(IID, j)) {
4455 CallInst *CEI = cast<CallInst>(VL0);
4456 ScalarArg = CEI->getArgOperand(j);
4457 OpVecs.push_back(CEI->getArgOperand(j));
4458 continue;
4459 }
4460
4461 Value *OpVec = vectorizeTree(E->getOperand(j));
4462 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
4463 OpVecs.push_back(OpVec);
4464 }
4465
4466 Module *M = F->getParent();
4467 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4468 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
4469 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
4470 SmallVector<OperandBundleDef, 1> OpBundles;
4471 CI->getOperandBundlesAsDefs(OpBundles);
4472 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
4473
4474 // The scalar argument uses an in-tree scalar so we add the new vectorized
4475 // call to ExternalUses list to make sure that an extract will be
4476 // generated in the future.
4477 if (ScalarArg && getTreeEntry(ScalarArg))
4478 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
4479
4480 propagateIRFlags(V, E->Scalars, VL0);
4481 if (NeedToShuffleReuses) {
4482 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4483 E->ReuseShuffleIndices, "shuffle");
4484 }
4485 E->VectorizedValue = V;
4486 ++NumVectorInstructions;
4487 return V;
4488 }
4489 case Instruction::ShuffleVector: {
4490 assert(E->isAltShuffle() &&
4491 ((Instruction::isBinaryOp(E->getOpcode()) &&
4492 Instruction::isBinaryOp(E->getAltOpcode())) ||
4493 (Instruction::isCast(E->getOpcode()) &&
4494 Instruction::isCast(E->getAltOpcode()))) &&
4495 "Invalid Shuffle Vector Operand");
4496
4497 Value *LHS = nullptr, *RHS = nullptr;
4498 if (Instruction::isBinaryOp(E->getOpcode())) {
4499 setInsertPointAfterBundle(E);
4500 LHS = vectorizeTree(E->getOperand(0));
4501 RHS = vectorizeTree(E->getOperand(1));
4502 } else {
4503 setInsertPointAfterBundle(E);
4504 LHS = vectorizeTree(E->getOperand(0));
4505 }
4506
4507 if (E->VectorizedValue) {
4508 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4509 return E->VectorizedValue;
4510 }
4511
4512 Value *V0, *V1;
4513 if (Instruction::isBinaryOp(E->getOpcode())) {
4514 V0 = Builder.CreateBinOp(
4515 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
4516 V1 = Builder.CreateBinOp(
4517 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
4518 } else {
4519 V0 = Builder.CreateCast(
4520 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
4521 V1 = Builder.CreateCast(
4522 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
4523 }
4524
4525 // Create shuffle to take alternate operations from the vector.
4526 // Also, gather up main and alt scalar ops to propagate IR flags to
4527 // each vector operation.
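      // For a 4-wide bundle alternating add/sub this produces a mask such as
      // <0, 5, 2, 7>: lanes with the main opcode read from V0, lanes with the
      // alternate opcode read from the second half of the concatenation (V1).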
4528 ValueList OpScalars, AltScalars;
4529 unsigned e = E->Scalars.size();
4530 SmallVector<Constant *, 8> Mask(e);
4531 for (unsigned i = 0; i < e; ++i) {
4532 auto *OpInst = cast<Instruction>(E->Scalars[i]);
4533 assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
4534 if (OpInst->getOpcode() == E->getAltOpcode()) {
4535 Mask[i] = Builder.getInt32(e + i);
4536 AltScalars.push_back(E->Scalars[i]);
4537 } else {
4538 Mask[i] = Builder.getInt32(i);
4539 OpScalars.push_back(E->Scalars[i]);
4540 }
4541 }
4542
4543 Value *ShuffleMask = ConstantVector::get(Mask);
4544 propagateIRFlags(V0, OpScalars);
4545 propagateIRFlags(V1, AltScalars);
4546
4547 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
4548 if (Instruction *I = dyn_cast<Instruction>(V))
4549 V = propagateMetadata(I, E->Scalars);
4550 if (NeedToShuffleReuses) {
4551 V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
4552 E->ReuseShuffleIndices, "shuffle");
4553 }
4554 E->VectorizedValue = V;
4555 ++NumVectorInstructions;
4556
4557 return V;
4558 }
4559 default:
4560 llvm_unreachable("unknown inst");
4561 }
4562 return nullptr;
4563}
4564
4565Value *BoUpSLP::vectorizeTree() {
4566 ExtraValueToDebugLocsMap ExternallyUsedValues;
4567 return vectorizeTree(ExternallyUsedValues);
4568}
4569
4570Value *
4571BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
4572 // All blocks must be scheduled before any instructions are inserted.
4573 for (auto &BSIter : BlocksSchedules) {
4574 scheduleBlock(BSIter.second.get());
4575 }
4576
4577 Builder.SetInsertPoint(&F->getEntryBlock().front());
4578 auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
4579
4580 // If the vectorized tree can be rewritten in a smaller type, we truncate the
4581 // vectorized root. InstCombine will then rewrite the entire expression. We
4582 // sign extend the extracted values below.
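  // E.g. an i32 tree whose values are known to fit in 8 bits gets its root
  // truncated to <BundleWidth x i8>; every external extract is then widened
  // back to i32 by the extend() helper further down.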
4583 auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
4584 if (MinBWs.count(ScalarRoot)) {
4585 if (auto *I = dyn_cast<Instruction>(VectorRoot))
4586 Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
4587 auto BundleWidth = VectorizableTree[0]->Scalars.size();
4588 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
4589 auto *VecTy = VectorType::get(MinTy, BundleWidth);
4590 auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
4591 VectorizableTree[0]->VectorizedValue = Trunc;
4592 }
4593
4594 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
4595 << " values .\n");
4596
4597 // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
4598 // specified by ScalarType.
4599 auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
4600 if (!MinBWs.count(ScalarRoot))
4601 return Ex;
4602 if (MinBWs[ScalarRoot].second)
4603 return Builder.CreateSExt(Ex, ScalarType);
4604 return Builder.CreateZExt(Ex, ScalarType);
4605 };
4606
4607 // Extract all of the elements with the external uses.
4608 for (const auto &ExternalUse : ExternalUses) {
4609 Value *Scalar = ExternalUse.Scalar;
4610 llvm::User *User = ExternalUse.User;
4611
4612 // Skip users that we already RAUW. This happens when one instruction
4613 // has multiple uses of the same value.
4614 if (User && !is_contained(Scalar->users(), User))
4615 continue;
4616 TreeEntry *E = getTreeEntry(Scalar);
4617 assert(E && "Invalid scalar");
4618 assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
4619
4620 Value *Vec = E->VectorizedValue;
4621 assert(Vec && "Can't find vectorizable value");
4622
4623 Value *Lane = Builder.getInt32(ExternalUse.Lane);
4624 // If User == nullptr, the Scalar is used as extra arg. Generate
4625 // ExtractElement instruction and update the record for this scalar in
4626 // ExternallyUsedValues.
4627 if (!User) {
4628 assert(ExternallyUsedValues.count(Scalar) &&
4629 "Scalar with nullptr as an external user must be registered in "
4630 "ExternallyUsedValues map");
4631 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
4632 Builder.SetInsertPoint(VecI->getParent(),
4633 std::next(VecI->getIterator()));
4634 } else {
4635 Builder.SetInsertPoint(&F->getEntryBlock().front());
4636 }
4637 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
4638 Ex = extend(ScalarRoot, Ex, Scalar->getType());
4639 CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
4640 auto &Locs = ExternallyUsedValues[Scalar];
4641 ExternallyUsedValues.insert({Ex, Locs});
4642 ExternallyUsedValues.erase(Scalar);
4643 // Required to update internally referenced instructions.
4644 Scalar->replaceAllUsesWith(Ex);
4645 continue;
4646 }
4647
4648 // Generate extracts for out-of-tree users.
4649 // Find the insertion point for the extractelement lane.
4650 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
4651 if (PHINode *PH = dyn_cast<PHINode>(User)) {
4652 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
4653 if (PH->getIncomingValue(i) == Scalar) {
4654 Instruction *IncomingTerminator =
4655 PH->getIncomingBlock(i)->getTerminator();
4656 if (isa<CatchSwitchInst>(IncomingTerminator)) {
4657 Builder.SetInsertPoint(VecI->getParent(),
4658 std::next(VecI->getIterator()));
4659 } else {
4660 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
4661 }
4662 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
4663 Ex = extend(ScalarRoot, Ex, Scalar->getType());
4664 CSEBlocks.insert(PH->getIncomingBlock(i));
4665 PH->setOperand(i, Ex);
4666 }
4667 }
4668 } else {
4669 Builder.SetInsertPoint(cast<Instruction>(User));
4670 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
4671 Ex = extend(ScalarRoot, Ex, Scalar->getType());
4672 CSEBlocks.insert(cast<Instruction>(User)->getParent());
4673 User->replaceUsesOfWith(Scalar, Ex);
4674 }
4675 } else {
4676 Builder.SetInsertPoint(&F->getEntryBlock().front());
4677 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
4678 Ex = extend(ScalarRoot, Ex, Scalar->getType());
4679 CSEBlocks.insert(&F->getEntryBlock());
4680 User->replaceUsesOfWith(Scalar, Ex);
4681 }
4682
4683 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
4684 }
4685
4686 // For each vectorized value:
4687 for (auto &TEPtr : VectorizableTree) {
4688 TreeEntry *Entry = TEPtr.get();
4689
4690 // No need to handle users of gathered values.
4691 if (Entry->State == TreeEntry::NeedToGather)
4692 continue;
4693
4694 assert(Entry->VectorizedValue && "Can't find vectorizable value");
4695
4696 // For each lane:
4697 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
4698 Value *Scalar = Entry->Scalars[Lane];
4699
4700#ifndef NDEBUG
4701 Type *Ty = Scalar->getType();
4702 if (!Ty->isVoidTy()) {
4703 for (User *U : Scalar->users()) {
4704 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
4705
4706 // It is legal to delete users in the ignorelist.
4707 assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
4708 "Deleting out-of-tree value");
4709 }
4710 }
4711#endif
4712 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
4713 eraseInstruction(cast<Instruction>(Scalar));
4714 }
4715 }
4716
4717 Builder.ClearInsertionPoint();
4718
4719 return VectorizableTree[0]->VectorizedValue;
4720}
4721
4722void BoUpSLP::optimizeGatherSequence() {
4723 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
4724 << " gather sequences instructions.\n");
  1. Assuming 'DebugFlag' is false
  2. Loop condition is false. Exiting loop
4725 // LICM InsertElementInst sequences.
4726 for (Instruction *I : GatherSeq) {
4727 if (isDeleted(I))
4728 continue;
4729
4730 // Check if this block is inside a loop.
4731 Loop *L = LI->getLoopFor(I->getParent());
4732 if (!L)
4733 continue;
4734
4735 // Check if it has a preheader.
4736 BasicBlock *PreHeader = L->getLoopPreheader();
4737 if (!PreHeader)
4738 continue;
4739
4740 // If the vector or the element that we insert into it are
4741 // instructions that are defined in this basic block then we can't
4742 // hoist this instruction.
4743 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
4744 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
4745 if (Op0 && L->contains(Op0))
4746 continue;
4747 if (Op1 && L->contains(Op1))
4748 continue;
4749
4750 // We can hoist this instruction. Move it to the pre-header.
4751 I->moveBefore(PreHeader->getTerminator());
4752 }
4753
4754 // Make a list of all reachable blocks in our CSE queue.
4755 SmallVector<const DomTreeNode *, 8> CSEWorkList;
4756 CSEWorkList.reserve(CSEBlocks.size());
4757 for (BasicBlock *BB : CSEBlocks)
4758 if (DomTreeNode *N = DT->getNode(BB)) {
4759 assert(DT->isReachableFromEntry(N));
4760 CSEWorkList.push_back(N);
4761 }
4762
4763 // Sort blocks by domination. This ensures we visit a block after all blocks
4764 // dominating it are visited.
4765 llvm::stable_sort(CSEWorkList,
4766 [this](const DomTreeNode *A, const DomTreeNode *B) {
4767 return DT->properlyDominates(A, B);
4768 });
4769
4770 // Perform O(N^2) search over the gather sequences and merge identical
4771 // instructions. TODO: We can further optimize this scan if we split the
4772 // instructions into different buckets based on the insert lane.
4773 SmallVector<Instruction *, 16> Visited;
4774 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
  3. Assuming 'I' is not equal to 'E'
  4. Loop condition is true. Entering loop body
4775 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
4776 "Worklist not sorted properly!");
  5. Assuming the condition is false
  6. Calling 'DominatorTreeBase::dominates'
  13. Returning from 'DominatorTreeBase::dominates'
  14. '?' condition is true
4777 BasicBlock *BB = (*I)->getBlock();
  15. Called C++ object pointer is null
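The dereference flagged above is guarded in practice: every node in CSEWorkList was pushed only after the `if (DomTreeNode *N = DT->getNode(BB))` test a few lines earlier, so *I cannot be null on this path. After inlining the call to 'DominatorTreeBase::dominates' (steps 6-13), however, the analyzer keeps a state in which the node pointed to by *I may be null and carries it to line 4777. A minimal sketch of how that invariant could be restated locally so the null path is pruned; this is illustrative only, not code from the file:

// Illustrative sketch (not part of SLPVectorizer.cpp): asserting the
// invariant before the dereference documents it for readers and lets the
// analyzer discard the null state, since a failed assert does not return.
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
  assert(*I && "CSEWorkList must not contain null DomTreeNodes");
  assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
         "Worklist not sorted properly!");
  BasicBlock *BB = (*I)->getBlock(); // *I is known non-null here
  // ... rest of the CSE scan over the block is unchanged ...
}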
4778 // For all instructions in blocks containing gather sequences:
4779 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
4780 Instruction *In = &*it++;
4781 if (isDeleted(In))
4782 continue;
4783 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
4784 continue;
4785
4786 // Check if we can replace this instruction with any of the
4787 // visited instructions.
4788 for (Instruction *v : Visited) {
4789 if (In->isIdenticalTo(v) &&
4790 DT->dominates(v->getParent(), In->getParent())) {
4791 In->replaceAllUsesWith(v);
4792 eraseInstruction(In);
4793 In = nullptr;
4794 break;
4795 }
4796 }
4797 if (In) {
4798 assert(!is_contained(Visited, In));
4799 Visited.push_back(In);
4800 }
4801 }
4802 }
4803 CSEBlocks.clear();
4804 GatherSeq.clear();
4805}
4806
4807// Groups the instructions to a bundle (which is then a single scheduling entity)
4808// and schedules instructions until the bundle gets ready.
4809Optional<BoUpSLP::ScheduleData *>
4810BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4811 const InstructionsState &S) {
4812 if (isa<PHINode>(S.OpValue))
4813 return nullptr;
4814
4815 // Initialize the instruction bundle.
4816 Instruction *OldScheduleEnd = ScheduleEnd;
4817 ScheduleData *PrevInBundle = nullptr;
4818 ScheduleData *Bundle = nullptr;
4819 bool ReSchedule = false;
4820 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
4821
4822 // Make sure that the scheduling region contains all
4823 // instructions of the bundle.
4824 for (Value *V : VL) {
4825 if (!extendSchedulingRegion(V, S))
4826 return None;
4827 }
4828
4829 for (Value *V : VL) {
4830 ScheduleData *BundleMember = getScheduleData(V);
4831 assert(BundleMember &&
4832 "no ScheduleData for bundle member (maybe not in same basic block)");
4833 if (BundleMember->IsScheduled) {
4834 // A bundle member was scheduled as single instruction before and now
4835 // needs to be scheduled as part of the bundle. We just get rid of the
4836 // existing schedule.
4837 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
4838 << " was already scheduled\n");
4839 ReSchedule = true;
4840 }
4841 assert(BundleMember->isSchedulingEntity() &&
4842 "bundle member already part of other bundle");
4843 if (PrevInBundle) {
4844 PrevInBundle->NextInBundle = BundleMember;
4845 } else {
4846 Bundle = BundleMember;
4847 }
4848 BundleMember->UnscheduledDepsInBundle = 0;
4849 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
4850
4851 // Group the instructions to a bundle.
4852 BundleMember->FirstInBundle = Bundle;
4853 PrevInBundle = BundleMember;
4854 }
4855 if (ScheduleEnd != OldScheduleEnd) {
4856 // The scheduling region got new instructions at the lower end (or it is a
4857 // new region for the first bundle). This makes it necessary to
4858 // recalculate all dependencies.
4859 // It is seldom that this needs to be done a second time after adding the
4860 // initial bundle to the region.
4861 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4862 doForAllOpcodes(I, [](ScheduleData *SD) {
4863 SD->clearDependencies();
4864 });
4865 }
4866 ReSchedule = true;
4867 }
4868 if (ReSchedule) {
4869 resetSchedule();
4870 initialFillReadyList(ReadyInsts);
4871 }
4872 assert(Bundle && "Failed to find schedule bundle");
4873
4874 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
4875 << BB->getName() << "\n");
4876
4877 calculateDependencies(Bundle, true, SLP);
4878
4879 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
4880 // means that there are no cyclic dependencies and we can schedule it.
4881 // Note that it's important that we don't "schedule" the bundle yet (see
4882 // cancelScheduling).
4883 while (!Bundle->isReady() && !ReadyInsts.empty()) {
4884
4885 ScheduleData *pickedSD = ReadyInsts.back();
4886 ReadyInsts.pop_back();
4887
4888 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
4889 schedule(pickedSD, ReadyInsts);
4890 }
4891 }
4892 if (!Bundle->isReady()) {
4893 cancelScheduling(VL, S.OpValue);
4894 return None;
4895 }
4896 return Bundle;
4897}
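For context, the Optional return value of tryScheduleBundle distinguishes a usable bundle, a nullptr for PHI nodes (which need no scheduling), and None when the values cannot be scheduled together. A hypothetical caller-side sketch, written as it would appear inside a BoUpSLP member function; the real call sites live earlier in this file:

// Hypothetical sketch only: BS is the BlockScheduling for the bundle's block.
Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
if (!Bundle) {
  // Scheduling failed (region size limit hit or a cyclic dependency):
  // the caller must record VL as a gather entry instead of vectorizing it.
}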
4898
4899void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
4900 Value *OpValue) {
4901 if (isa<PHINode>(OpValue))
4902 return;
4903
4904 ScheduleData *Bundle = getScheduleData(OpValue);
4905 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
4906 assert(!Bundle->IsScheduled &&
4907 "Can't cancel bundle which is already scheduled");
4908 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
4909 "tried to unbundle something which is not a bundle");
4910
4911 // Un-bundle: make single instructions out of the bundle.
4912 ScheduleData *BundleMember = Bundle;
4913 while (BundleMember) {
4914 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
4915 BundleMember->FirstInBundle = BundleMember;
4916 ScheduleData *Next = BundleMember->NextInBundle;
4917 BundleMember->NextInBundle = nullptr;
4918 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
4919 if (BundleMember->UnscheduledDepsInBundle == 0) {
4920 ReadyInsts.insert(BundleMember);
4921 }
4922 BundleMember = Next;
4923 }
4924}
4925
4926BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
4927 // Allocate a new ScheduleData for the instruction.
4928 if (ChunkPos >= ChunkSize) {
4929 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
4930 ChunkPos = 0;
4931 }
4932 return &(ScheduleDataChunks.back()[ChunkPos++]);
4933}
4934
4935bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
4936 const InstructionsState &S) {
4937 if (getScheduleData(V, isOneOf(S, V)))
4938 return true;
4939 Instruction *I = dyn_cast<Instruction>(V);
4940 assert(I && "bundle member must be an instruction");
4941 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
4942 auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
4943 ScheduleData *ISD = getScheduleData(I);
4944 if (!ISD)
4945 return false;
4946 assert(isInSchedulingRegion(ISD) &&
4947 "ScheduleData not in scheduling region");
4948 ScheduleData *SD = allocateScheduleDataChunks();
4949 SD->Inst = I;
4950 SD->init(SchedulingRegionID, S.OpValue);
4951 ExtraScheduleDataMap[I][S.OpValue] = SD;
4952 return true;
4953 };
4954 if (CheckSheduleForI(I))
4955 return true;
4956 if (!ScheduleStart) {
4957 // It's the first instruction in the new region.
4958 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
4959 ScheduleStart = I;
4960 ScheduleEnd = I->getNextNode();
4961 if (isOneOf(S, I) != I)
4962 CheckSheduleForI(I);
4963 assert(ScheduleEnd && "tried to vectorize a terminator?");
4964 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
4965 return true;
4966 }
4967 // Search up and down at the same time, because we don't know if the new
4968 // instruction is above or below the existing scheduling region.
4969 BasicBlock::reverse_iterator UpIter =
4970 ++ScheduleStart->getIterator().getReverse();
4971 BasicBlock::reverse_iterator UpperEnd = BB->rend();
4972 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
4973 BasicBlock::iterator LowerEnd = BB->end();
4974 while (true) {
4975 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
4976 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
4977 return false;
4978 }
4979
4980 if (UpIter != UpperEnd) {
4981 if (&*UpIter == I) {
4982 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
4983 ScheduleStart = I;
4984 if (isOneOf(S, I) != I)
4985 CheckSheduleForI(I);
4986 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
4987 << "\n");
4988 return true;
4989 }
4990 ++UpIter;
4991 }
4992 if (DownIter != LowerEnd) {
4993 if (&*DownIter == I) {
4994 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
4995 nullptr);
4996 ScheduleEnd = I->getNextNode();
4997 if (isOneOf(S, I) != I)
4998 CheckSheduleForI(I);
4999 assert(ScheduleEnd && "tried to vectorize a terminator?");
5000 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
5001 << "\n");
5002 return true;
5003 }
5004 ++DownIter;
5005 }
5006 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
5007 "instruction not found in block");
5008 }
5009 return true;
5010}
5011
5012void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
5013 Instruction *ToI,
5014 ScheduleData *PrevLoadStore,
5015 ScheduleData *NextLoadStore) {
5016 ScheduleData *CurrentLoadStore = PrevLoadStore;
5017 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
5018 ScheduleData *SD = ScheduleDataMap[I];
5019 if (!SD) {
5020 SD = allocateScheduleDataChunks();
5021 ScheduleDataMap[I] = SD;
5022 SD->Inst = I;
5023 }
5024 assert(!isInSchedulingRegion(SD) &&
5025 "new ScheduleData already in scheduling region");
5026 SD->init(SchedulingRegionID, I);
5027
5028 if (I->mayReadOrWriteMemory() &&
5029 (!isa<IntrinsicInst>(I) ||
5030 cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
5031 // Update the linked list of memory accessing instructions.
5032 if (CurrentLoadStore) {
5033 CurrentLoadStore->NextLoadStore = SD;
5034 } else {
5035 FirstLoadStoreInRegion = SD;
5036 }
5037 CurrentLoadStore = SD;
5038 }
5039 }
5040 if (NextLoadStore) {
5041 if (CurrentLoadStore)
5042 CurrentLoadStore->NextLoadStore = NextLoadStore;
5043 } else {
5044 LastLoadStoreInRegion = CurrentLoadStore;
5045 }
5046}
5047
5048void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
5049 bool InsertInReadyList,
5050 BoUpSLP *SLP) {
5051 assert(SD->isSchedulingEntity());
5052
5053 SmallVector<ScheduleData *, 10> WorkList;
5054 WorkList.push_back(SD);
5055
5056 while (!WorkList.empty()) {
5057 ScheduleData *SD = WorkList.back();
5058 WorkList.pop_back();
5059
5060 ScheduleData *BundleMember = SD;
5061 while (BundleMember) {
5062 assert(isInSchedulingRegion(BundleMember));
5063 if (!BundleMember->hasValidDependencies()) {
5064
5065 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
5066 << "\n");
5067 BundleMember->Dependencies = 0;
5068 BundleMember->resetUnscheduledDeps();
5069
5070 // Handle def-use chain dependencies.
5071 if (BundleMember->OpValue != BundleMember->Inst) {
5072 ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
5073 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
5074 BundleMember->Dependencies++;
5075 ScheduleData *DestBundle = UseSD->FirstInBundle;
5076 if (!DestBundle->IsScheduled)
5077 BundleMember->incrementUnscheduledDeps(1);
5078 if (!DestBundle->hasValidDependencies())
5079 WorkList.push_back(DestBundle);
5080 }
5081 } else {
5082 for (User *U : BundleMember->Inst->users()) {
5083 if (isa<Instruction>(U)) {
5084 ScheduleData *UseSD = getScheduleData(U);
5085 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
5086 BundleMember->Dependencies++;
5087 ScheduleData *DestBundle = UseSD->FirstInBundle;
5088 if (!DestBundle->IsScheduled)
5089 BundleMember->incrementUnscheduledDeps(1);
5090 if (!DestBundle->hasValidDependencies())
5091 WorkList.push_back(DestBundle);
5092 }
5093 } else {
5094 // I'm not sure if this can ever happen. But we need to be safe.
5095 // This lets the instruction/bundle never be scheduled and
5096 // eventually disable vectorization.
5097 BundleMember->Dependencies++;
5098 BundleMember->incrementUnscheduledDeps(1);
5099 }
5100 }
5101 }
5102
5103 // Handle the memory dependencies.
5104 ScheduleData *DepDest = BundleMember->NextLoadStore;
5105 if (DepDest) {
5106 Instruction *SrcInst = BundleMember->Inst;
5107 MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
5108 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
5109 unsigned numAliased = 0;
5110 unsigned DistToSrc = 1;
5111
5112 while (DepDest) {
5113 assert(isInSchedulingRegion(DepDest));
5114
5115 // We have two limits to reduce the complexity:
5116 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
5117 // SLP->isAliased (which is the expensive part in this loop).
5118 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
5119 // the whole loop (even if the loop is fast, it's quadratic).
5120 // It's important for the loop break condition (see below) to
5121 // check this limit even between two read-only instructions.
5122 if (DistToSrc >= MaxMemDepDistance ||
5123 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
5124 (numAliased >= AliasedCheckLimit ||
5125 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
5126
5127 // We increment the counter only if the locations are aliased
5128 // (instead of counting all alias checks). This gives a better
5129 // balance between reduced runtime and accurate dependencies.
5130 numAliased++;
5131
5132 DepDest->MemoryDependencies.push_back(BundleMember);
5133 BundleMember->Dependencies++;
5134 ScheduleData *DestBundle = DepDest->FirstInBundle;
5135 if (!DestBundle->IsScheduled) {
5136 BundleMember->incrementUnscheduledDeps(1);
5137 }
5138 if (!DestBundle->hasValidDependencies()) {
5139 WorkList.push_back(DestBundle);
5140 }
5141 }
5142 DepDest = DepDest->NextLoadStore;
5143
5144 // Example, explaining the loop break condition: Let's assume our
5145 // starting instruction is i0 and MaxMemDepDistance = 3.
5146 //
5147 // +--------v--v--v
5148 // i0,i1,i2,i3,i4,i5,i6,i7,i8
5149 // +--------^--^--^
5150 //
5151 // MaxMemDepDistance let us stop alias-checking at i3 and we add
5152 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
5153 // Previously we already added dependencies from i3 to i6,i7,i8
5154 // (because of MaxMemDepDistance). As we added a dependency from
5155 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
5156 // and we can abort this loop at i6.
5157 if (DistToSrc >= 2 * MaxMemDepDistance)
5158 break;
5159 DistToSrc++;
5160 }
5161 }
5162 }
5163 BundleMember = BundleMember->NextInBundle;
5164 }
5165 if (InsertInReadyList && SD->isReady()) {
5166 ReadyInsts.push_back(SD);
5167 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
5168 << "\n");
5169 }
5170 }
5171}
5172
5173void BoUpSLP::BlockScheduling::resetSchedule() {
5174 assert(ScheduleStart &&
5175 "tried to reset schedule on block which has not been scheduled");
5176 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5177 doForAllOpcodes(I, [&](ScheduleData *SD) {
5178 assert(isInSchedulingRegion(SD) &&
5179 "ScheduleData not in scheduling region");
5180 SD->IsScheduled = false;
5181 SD->resetUnscheduledDeps();
5182 });
5183 }
5184 ReadyInsts.clear();
5185}
5186
5187void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
5188 if (!BS->ScheduleStart)
5189 return;
5190
5191 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
5192
5193 BS->resetSchedule();
5194
5195 // For the real scheduling we use a more sophisticated ready-list: it is
5196 // sorted by the original instruction location. This lets the final schedule
5197 // be as close as possible to the original instruction order.
5198 struct ScheduleDataCompare {
5199 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
5200 return SD2->SchedulingPriority < SD1->SchedulingPriority;
5201 }
5202 };
5203 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
5204
5205 // Ensure that all dependency data is updated and fill the ready-list with
5206 // initial instructions.
5207 int Idx = 0;
5208 int NumToSchedule = 0;
5209 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
5210 I = I->getNextNode()) {
5211 BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
5212 assert(SD->isPartOfBundle() ==
5213 (getTreeEntry(SD->Inst) != nullptr) &&
5214 "scheduler and vectorizer bundle mismatch");
5215 SD->FirstInBundle->SchedulingPriority = Idx++;
5216 if (SD->isSchedulingEntity()) {
5217 BS->calculateDependencies(SD, false, this);
5218 NumToSchedule++;
5219 }
5220 });
5221 }
5222 BS->initialFillReadyList(ReadyInsts);
5223
5224 Instruction *LastScheduledInst = BS->ScheduleEnd;
5225
5226 // Do the "real" scheduling.
5227 while (!ReadyInsts.empty()) {
5228 ScheduleData *picked = *ReadyInsts.begin();
5229 ReadyInsts.erase(ReadyInsts.begin());
5230
5231 // Move the scheduled instruction(s) to their dedicated places, if not
5232 // there yet.
5233 ScheduleData *BundleMember = picked;
5234 while (BundleMember) {
5235 Instruction *pickedInst = BundleMember->Inst;
5236 if (LastScheduledInst->getNextNode() != pickedInst) {
5237 BS->BB->getInstList().remove(pickedInst);
5238 BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
5239 pickedInst);
5240 }
5241 LastScheduledInst = pickedInst;
5242 BundleMember = BundleMember->NextInBundle;
5243 }
5244
5245 BS->schedule(picked, ReadyInsts);
5246 NumToSchedule--;
5247 }
5248 assert(NumToSchedule == 0 && "could not schedule all instructions");
5249
5250 // Avoid duplicate scheduling of the block.
5251 BS->ScheduleStart = nullptr;
5252}
5253
5254unsigned BoUpSLP::getVectorElementSize(Value *V) const {
5255 // If V is a store, just return the width of the stored value without
5256 // traversing the expression tree. This is the common case.
5257 if (auto *Store = dyn_cast<StoreInst>(V))
5258 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
5259
5260 // If V is not a store, we can traverse the expression tree to find loads
5261 // that feed it. The type of the loaded value may indicate a more suitable
5262 // width than V's type. We want to base the vector element size on the width
5263 // of memory operations where possible.
5264 SmallVector<Instruction *, 16> Worklist;
5265 SmallPtrSet<Instruction *, 16> Visited;
5266 if (auto *I = dyn_cast<Instruction>(V))
5267 Worklist.push_back(I);
5268
5269 // Traverse the expression tree in bottom-up order looking for loads. If we
5270 // encounter an instruction we don't yet handle, we give up.
5271 auto MaxWidth = 0u;
5272 auto FoundUnknownInst = false;
5273 while (!Worklist.empty() && !FoundUnknownInst) {
5274 auto *I = Worklist.pop_back_val();
5275 Visited.insert(I);
5276
5277 // We should only be looking at scalar instructions here. If the current
5278 // instruction has a vector type, give up.
5279 auto *Ty = I->getType();
5280 if (isa<VectorType>(Ty))
5281 FoundUnknownInst = true;
5282
5283 // If the current instruction is a load, update MaxWidth to reflect the
5284 // width of the loaded value.
5285 else if (isa<LoadInst>(I))
5286 MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
5287
5288 // Otherwise, we need to visit the operands of the instruction. We only
5289 // handle the interesting cases from buildTree here. If an operand is an
5290 // instruction we haven't yet visited, we add it to the worklist.
5291 else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
5292 isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
5293 for (Use &U : I->operands())
5294 if (auto *J = dyn_cast<Instruction>(U.get()))
5295 if (!Visited.count(J))
5296 Worklist.push_back(J);
5297 }
5298
5299 // If we don't yet handle the instruction, give up.
5300 else
5301 FoundUnknownInst = true;
5302 }
5303
5304 // If we didn't encounter a memory access in the expression tree, or if we
5305 // gave up for some reason, just return the width of V.
5306 if (!MaxWidth || FoundUnknownInst)
5307 return DL->getTypeSizeInBits(V->getType());
5308
5309 // Otherwise, return the maximum width we found.
5310 return MaxWidth;
5311}
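As a concrete illustration of the traversal in getVectorElementSize (hypothetical IR, not taken from the file or any test), written here in comment form:

//   %l0 = load i8, i8* %p
//   %l1 = load i8, i8* %q
//   %x  = zext i8 %l0 to i32
//   %y  = zext i8 %l1 to i32
//   %a  = add i32 %x, %y        ; V passed to getVectorElementSize
// The walk visits %a, both zext casts and both loads, never meets a vector
// type or an unhandled opcode, so MaxWidth ends up as 8 and the function
// returns 8 rather than the 32 bits of %a's type.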
5312
5313// Determine if a value V in a vectorizable expression Expr can be demoted to a
5314// smaller type with a truncation. We collect the values that will be demoted
5315// in ToDemote and additional roots that require investigating in Roots.
5316static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
5317 SmallVectorImpl<Value *> &ToDemote,
5318 SmallVectorImpl<Value *> &Roots) {
5319 // We can always demote constants.
5320 if (isa<Constant>(V)) {
5321 ToDemote.push_back(V);
5322 return true;
5323 }
5324
5325 // If the value is not an instruction in the expression with only one use, it
5326 // cannot be demoted.
5327 auto *I = dyn_cast<Instruction>(V);
5328 if (!I || !I->hasOneUse() || !Expr.count(I))
5329 return false;
5330
5331 switch (I->getOpcode()) {
5332
5333 // We can always demote truncations and extensions. Since truncations can
5334 // seed additional demotion, we save the truncated value.
5335 case Instruction::Trunc:
5336 Roots.push_back(I->getOperand(0));
5337 break;
5338 case Instruction::ZExt:
5339 case Instruction::SExt:
5340 break;
5341
5342 // We can demote certain binary operations if we can demote both of their
5343 // operands.
5344 case Instruction::Add:
5345 case Instruction::Sub:
5346 case Instruction::Mul:
5347 case Instruction::And:
5348 case Instruction::Or:
5349 case Instruction::Xor:
5350 if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
5351 !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
5352 return false;
5353 break;
5354
5355 // We can demote selects if we can demote their true and false values.
5356 case Instruction::Select: {
5357 SelectInst *SI = cast<SelectInst>(I);
5358 if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
5359 !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
5360 return false;
5361 break;
5362 }
5363
5364 // We can demote phis if we can demote all their incoming operands. Note that
5365 // we don't need to worry about cycles since we ensure single use above.
5366 case Instruction::PHI: {
5367 PHINode *PN = cast<PHINode>(I);
5368 for (Value *IncValue : PN->incoming_values())
5369 if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
5370 return false;
5371 break;
5372 }
5373
5374 // Otherwise, conservatively give up.
5375 default:
5376 return false;
5377 }
5378
5379 // Record the value that we can demote.
5380 ToDemote.push_back(V);
5381 return true;
5382}
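A small worked example of the recursion above (hypothetical values, assuming each instruction is in Expr and has a single use), written in comment form:

//   %t = trunc i32 %x to i8
//   %a = add i8 %t, 7          ; call collectValuesToDemote(%a, ...)
// The Add case recurses into both operands: the Trunc records %t in ToDemote
// and pushes %x onto Roots as a further demotion seed, the constant 7 is
// always demotable, and finally %a itself is recorded. The caller revisits
// Roots later, once it knows the roots really can be truncated.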
5383
5384void BoUpSLP::computeMinimumValueSizes() {
5385 // If there are no external uses, the expression tree must be rooted by a
5386 // store. We can't demote in-memory values, so there is nothing to do here.
5387 if (ExternalUses.empty())
5388 return;
5389
5390 // We only attempt to truncate integer expressions.
5391 auto &TreeRoot = VectorizableTree[0]->Scalars;
5392 auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
5393 if (!TreeRootIT)
5394 return;
5395
5396 // If the expression is not rooted by a store, these roots should have
5397 // external uses. We will rely on InstCombine to rewrite the expression in
5398 // the narrower type. However, InstCombine only rewrites single-use values.
5399 // This means that if a tree entry other than a root is used externally, it
5400 // must have multiple uses and InstCombine will not rewrite it. The code
5401 // below ensures that only the roots are used externally.
5402 SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
5403 for (auto &EU : ExternalUses)
5404 if (!Expr.erase(EU.Scalar))
5405 return;
5406 if (!Expr.empty())
5407 return;
5408
5409 // Collect the scalar values of the vectorizable expression. We will use this
5410 // context to determine which values can be demoted. If we see a truncation,
5411 // we mark it as seeding another demotion.
5412 for (auto &EntryPtr : VectorizableTree)
5413 Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
5414
5415 // Ensure the roots of the vectorizable tree don't form a cycle. They must
5416 // have a single external user that is not in the vectorizable tree.
5417 for (auto *Root : TreeRoot)
5418 if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
5419 return;
5420
5421 // Conservatively determine if we can actually truncate the roots of the
5422 // expression. Collect the values that can be demoted in ToDemote and
5423 // additional roots that require investigating in Roots.
5424 SmallVector<Value *, 32> ToDemote;
5425 SmallVector<Value *, 4> Roots;
5426 for (auto *Root : TreeRoot)
5427 if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
5428 return;
5429
5430 // The maximum bit width required to represent all the values that can be
5431 // demoted without loss of precision. It would be safe to truncate the roots
5432 // of the expression to this width.
5433 auto MaxBitWidth = 8u;
5434
5435 // We first check if all the bits of the roots are demanded. If they're not,
5436 // we can truncate the roots to this narrower type.
5437 for (auto *Root : TreeRoot) {
5438 auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
5439 MaxBitWidth = std::max<unsigned>(
5440 Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
5441 }
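For instance (hypothetical numbers), if the demanded-bits mask of a 32-bit root covers only the low 16 bits, the computation above yields a 16-bit requirement:

// Hypothetical 32-bit root where DemandedBits reports only the low 16 bits:
llvm::APInt Mask(32, 0xFFFF);
unsigned NeededBits = Mask.getBitWidth() - Mask.countLeadingZeros(); // 32 - 16 == 16
// MaxBitWidth = std::max<unsigned>(NeededBits, MaxBitWidth);        // becomes 16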
5442
5443 // True if the roots can be zero-extended back to their original type, rather
5444 // than sign-extended. We know that if the leading bits are not demanded, we
5445 // can safely zero-extend. So we initialize IsKnownPositive to True.
5446 bool IsKnownPositive = true;
5447
5448 // If all the bits of the roots are demanded, we can try a little harder to
5449 // compute a narrower type. This can happen, for example, if the roots are
5450 // getelementptr indices. InstCombine promotes these indices to the pointer
5451 // width. Thus, all their bits are technically demanded even though the
5452 // address computation might be vectorized in a smaller type.
5453 //
5454 // We start by looking at each entry that can be demoted. We compute the
5455 // maximum bit width required to store the scalar by using ValueTracking to
5456 // compute the number of high-order bits we can truncate.
5457 if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
5458 llvm::all_of(TreeRoot, [](Value *R) {
5459 assert(R->hasOneUse() && "Root should have only one use!");
5460 return isa<GetElementPtrInst>(R->user_back());
5461 })) {
5462 MaxBitWidth = 8u;
5463
5464 // Determine if the sign bit of all the roots is known to be zero. If not,
5465 // IsKnownPositive is set to False.
5466 IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
5467 KnownBits Known = computeKnownBits(R, *DL);
5468 return Known.isNonNegative();
5469 });
5470
5471 // Determine the maximum number of bits required to store the scalar
5472 // values.
5473 for (auto *Scalar : ToDemote) {
5474 auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
5475 auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
5476 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
5477 }
5478
5479 // If we can't prove that the sign bit is zero, we must add one to the
5480 // maximum bit width to account for the unknown sign bit. This preserves
5481 // the existing sign bit so we can safely sign-extend the root back to the
5482 // original type. Otherwise, if we know the sign bit is zero, we will
5483 // zero-extend the root instead.
5484 //
5485 // FIXME: This is somewhat suboptimal, as there will be cases where adding
5486 // one to the maximum bit width will yield a larger-than-necessary
5487 // type. In general, we need to add an extra bit only if we can't
5488 // prove that the upper bit of the original type is equal to the
5489 // upper bit of the proposed smaller type. If these two bits are the
5490 // same (either zero or one) we know that sign-extending from the
5491 // smaller type will result in the same value. Here, since we can't
5492 // yet prove this, we are just making the proposed smaller type
5493 // larger to ensure correctness.
5494 if (!IsKnownPositive)
5495 ++MaxBitWidth;
5496 }
5497
5498 // Round MaxBitWidth up to the next power-of-two.
5499 if (!isPowerOf2_64(MaxBitWidth))
5500 MaxBitWidth = NextPowerOf2(MaxBitWidth);
5501
5502 // If the maximum bit width we compute is less than the width of the roots'
5503 // type, we can proceed with the narrowing. Otherwise, do nothing.
5504 if (MaxBitWidth >= TreeRootIT->getBitWidth())
5505 return;
5506
5507 // If we can truncate the root, we must collect additional values that might
5508 // be demoted as a result. That is, those seeded by truncations we will
5509 // modify.
5510 while (!Roots.empty())
5511 collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
5512
5513 // Finally, map the values we can demote to the maximum bit width we computed.
5514 for (auto *Scalar : ToDemote)
5515 MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
5516}
5517
5518namespace {
5519
5520/// The SLPVectorizer Pass.
5521struct SLPVectorizer : public FunctionPass {
5522 SLPVectorizerPass Impl;
5523
5524 /// Pass identification, replacement for typeid
5525 static char ID;
5526
5527 explicit SLPVectorizer() : FunctionPass(ID) {
5528 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
5529 }
5530
5531 bool doInitialization(Module &M) override {
5532 return false;
5533 }
5534
5535 bool runOnFunction(Function &F) override {
5536 if (skipFunction(F))
5537 return false;
5538
5539 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
5540 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
5541 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
5542 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
5543 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
5544 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
5545 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5546 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
5547 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
5548 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
5549
5550 return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
5551 }
5552
5553 void getAnalysisUsage(AnalysisUsage &AU) const override {
5554 FunctionPass::getAnalysisUsage(AU);
5555 AU.addRequired<AssumptionCacheTracker>();
5556 AU.addRequired<ScalarEvolutionWrapperPass>();
5557 AU.addRequired<AAResultsWrapperPass>();
5558 AU.addRequired<TargetTransformInfoWrapperPass>();
5559 AU.addRequired<LoopInfoWrapperPass>();
5560 AU.addRequired<DominatorTreeWrapperPass>();
5561 AU.addRequired<DemandedBitsWrapperPass>();
5562 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
5563 AU.addPreserved<LoopInfoWrapperPass>();
5564 AU.addPreserved<DominatorTreeWrapperPass>();
5565 AU.addPreserved<AAResultsWrapperPass>();
5566 AU.addPreserved<GlobalsAAWrapperPass>();
5567 AU.setPreservesCFG();
5568 }
5569};
5570
5571} // end anonymous namespace
5572
5573PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
5574 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
5575 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
5576 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
5577 auto *AA = &AM.getResult<AAManager>(F);
5578 auto *LI = &AM.getResult<LoopAnalysis>(F);
5579 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
5580 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
5581 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
5582 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
5583
5584 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
5585 if (!Changed)
5586 return PreservedAnalyses::all();
5587
5588 PreservedAnalyses PA;
5589 PA.preserveSet<CFGAnalyses>();
5590 PA.preserve<AAManager>();
5591 PA.preserve<GlobalsAA>();
5592 return PA;
5593}
5594
5595bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
5596 TargetTransformInfo *TTI_,
5597 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,