Bug Summary

File: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Warning: line 2521, column 23
Access to field 'IsScheduled' results in a dereference of a null pointer (loaded from variable 'SD')
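
Editor's note: line 2521 itself is outside this excerpt, so the snippet below is only a hedged sketch of the defect class the core null-dereference checker reports here, namely a ScheduleData pointer that may be null on some path being dereferenced without a guard. The names mirror the report, but the control flow is hypothetical.

  ScheduleData *SD = getScheduleData(V); // assumed lookup; may return nullptr on some path
  if (SD->IsScheduled)                   // flagged pattern: field access without a null check
    doSomething();
  // A guarded form such as `if (SD && SD->IsScheduled)` would not be flagged.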

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name SLPVectorizer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/lib/Transforms/Vectorize -I include -I /build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-01-19-134126-35450-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220119111520+da61cb019eb2/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
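
Editor's note: as a concrete, hedged illustration of the consecutive-store pattern described in the header comment above, consider the plain C++ function below. On a target with 128-bit vectors, the SLP vectorizer may replace the four scalar stores with a single <4 x i32> store, subject to the cost model; the function name is made up for the example.

  void slp_example(int *A, int X) {
    // Four adjacent stores fed by isomorphic scalar expressions;
    // SLP can bundle them into one vector store when profitable.
    A[0] = X + 1;
    A[1] = X + 2;
    A[2] = X + 3;
    A[3] = X + 4;
  }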
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/Optional.h"
23#include "llvm/ADT/PostOrderIterator.h"
24#include "llvm/ADT/PriorityQueue.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SetOperations.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/SmallBitVector.h"
29#include "llvm/ADT/SmallPtrSet.h"
30#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/SmallString.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/iterator.h"
34#include "llvm/ADT/iterator_range.h"
35#include "llvm/Analysis/AliasAnalysis.h"
36#include "llvm/Analysis/AssumptionCache.h"
37#include "llvm/Analysis/CodeMetrics.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/LoopAccessAnalysis.h"
42#include "llvm/Analysis/LoopInfo.h"
43#include "llvm/Analysis/MemoryLocation.h"
44#include "llvm/Analysis/OptimizationRemarkEmitter.h"
45#include "llvm/Analysis/ScalarEvolution.h"
46#include "llvm/Analysis/ScalarEvolutionExpressions.h"
47#include "llvm/Analysis/TargetLibraryInfo.h"
48#include "llvm/Analysis/TargetTransformInfo.h"
49#include "llvm/Analysis/ValueTracking.h"
50#include "llvm/Analysis/VectorUtils.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/DebugLoc.h"
57#include "llvm/IR/DerivedTypes.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/NoFolder.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PatternMatch.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
75#include "llvm/IR/Verifier.h"
76#include "llvm/InitializePasses.h"
77#include "llvm/Pass.h"
78#include "llvm/Support/Casting.h"
79#include "llvm/Support/CommandLine.h"
80#include "llvm/Support/Compiler.h"
81#include "llvm/Support/DOTGraphTraits.h"
82#include "llvm/Support/Debug.h"
83#include "llvm/Support/ErrorHandling.h"
84#include "llvm/Support/GraphWriter.h"
85#include "llvm/Support/InstructionCost.h"
86#include "llvm/Support/KnownBits.h"
87#include "llvm/Support/MathExtras.h"
88#include "llvm/Support/raw_ostream.h"
89#include "llvm/Transforms/Utils/InjectTLIMappings.h"
90#include "llvm/Transforms/Utils/LoopUtils.h"
91#include "llvm/Transforms/Vectorize.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <set>
98#include <string>
99#include <tuple>
100#include <utility>
101#include <vector>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
113 cl::desc("Run the SLP vectorization passes"));
114
115static cl::opt<int>
116 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
117 cl::desc("Only vectorize if you gain more than this "
118 "number "));
119
120static cl::opt<bool>
121ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
122 cl::desc("Attempt to vectorize horizontal reductions"));
123
124static cl::opt<bool> ShouldStartVectorizeHorAtStore(
125 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
126 cl::desc(
127 "Attempt to vectorize horizontal reductions feeding into a store"));
128
129static cl::opt<int>
130MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
131 cl::desc("Attempt to vectorize for this register size in bits"));
132
133static cl::opt<unsigned>
134MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
135 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
136
137static cl::opt<int>
138MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
139 cl::desc("Maximum depth of the lookup for consecutive stores."));
140
141/// Limits the size of scheduling regions in a block.
142/// It avoids long compile times for _very_ large blocks where vector
143/// instructions are spread over a wide range.
144/// This limit is way higher than needed by real-world functions.
145static cl::opt<int>
146ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
147 cl::desc("Limit the size of the SLP scheduling region per block"));
148
149static cl::opt<int> MinVectorRegSizeOption(
150 "slp-min-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
153static cl::opt<unsigned> RecursionMaxDepth(
154 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
155 cl::desc("Limit the recursion depth when building a vectorizable tree"));
156
157static cl::opt<unsigned> MinTreeSize(
158 "slp-min-tree-size", cl::init(3), cl::Hidden,
159 cl::desc("Only vectorize small trees if they are fully vectorizable"));
160
161// The maximum depth that the look-ahead score heuristic will explore.
162// The higher this value, the higher the compilation time overhead.
163static cl::opt<int> LookAheadMaxDepth(
164 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
165 cl::desc("The maximum look-ahead depth for operand reordering scores"));
166
167// The Look-ahead heuristic goes through the users of the bundle to calculate
168// the users' cost in getExternalUsesCost(). To avoid a compile-time increase,
169// we limit the number of users visited to this value.
170static cl::opt<unsigned> LookAheadUsersBudget(
171 "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
172 cl::desc("The maximum number of users to visit while visiting the "
173 "predecessors. This prevents compilation time increase."));
174
175static cl::opt<bool>
176 ViewSLPTree("view-slp-tree", cl::Hidden,
177 cl::desc("Display the SLP trees with Graphviz"));
178
179// Limit the number of alias checks. The limit is chosen so that
180// it has no negative effect on the llvm benchmarks.
181static const unsigned AliasedCheckLimit = 10;
182
183// Another limit for the alias checks: The maximum distance between load/store
184// instructions where alias checks are done.
185// This limit is useful for very large basic blocks.
186static const unsigned MaxMemDepDistance = 160;
187
188/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
189/// regions to be handled.
190static const int MinScheduleRegionSize = 16;
191
192/// Predicate for the element types that the SLP vectorizer supports.
193///
194/// The most important thing to filter here are types which are invalid in LLVM
195/// vectors. We also filter target specific types which have absolutely no
196/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
197/// avoids spending time checking the cost model and realizing that they will
198/// be inevitably scalarized.
199static bool isValidElementType(Type *Ty) {
200 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
201 !Ty->isPPC_FP128Ty();
202}
203
204/// \returns True if the value is a constant (but not globals/constant
205/// expressions).
206static bool isConstant(Value *V) {
207 return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
208}
209
210/// Checks if \p V is one of vector-like instructions, i.e. undef,
211/// insertelement/extractelement with constant indices for fixed vector type or
212/// extractvalue instruction.
213static bool isVectorLikeInstWithConstOps(Value *V) {
214 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
215 !isa<ExtractValueInst, UndefValue>(V))
216 return false;
217 auto *I = dyn_cast<Instruction>(V);
218 if (!I || isa<ExtractValueInst>(I))
219 return true;
220 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
221 return false;
222 if (isa<ExtractElementInst>(I))
223 return isConstant(I->getOperand(1));
224 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
225 return isConstant(I->getOperand(2));
226}
227
228/// \returns true if all of the instructions in \p VL are in the same block or
229/// false otherwise.
230static bool allSameBlock(ArrayRef<Value *> VL) {
231 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
232 if (!I0)
233 return false;
234 if (all_of(VL, isVectorLikeInstWithConstOps))
235 return true;
236
237 BasicBlock *BB = I0->getParent();
238 for (int I = 1, E = VL.size(); I < E; I++) {
239 auto *II = dyn_cast<Instruction>(VL[I]);
240 if (!II)
241 return false;
242
243 if (BB != II->getParent())
244 return false;
245 }
246 return true;
247}
248
249/// \returns True if all of the values in \p VL are constants (but not
250/// globals/constant expressions).
251static bool allConstant(ArrayRef<Value *> VL) {
252 // Constant expressions and globals can't be vectorized like normal integer/FP
253 // constants.
254 return all_of(VL, isConstant);
255}
256
257/// \returns True if all of the values in \p VL are identical or some of them
258/// are UndefValue.
259static bool isSplat(ArrayRef<Value *> VL) {
260 Value *FirstNonUndef = nullptr;
261 for (Value *V : VL) {
262 if (isa<UndefValue>(V))
263 continue;
264 if (!FirstNonUndef) {
265 FirstNonUndef = V;
266 continue;
267 }
268 if (V != FirstNonUndef)
269 return false;
270 }
271 return FirstNonUndef != nullptr;
272}
273
274/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
275static bool isCommutative(Instruction *I) {
276 if (auto *Cmp = dyn_cast<CmpInst>(I))
277 return Cmp->isCommutative();
278 if (auto *BO = dyn_cast<BinaryOperator>(I))
279 return BO->isCommutative();
280 // TODO: This should check for generic Instruction::isCommutative(), but
281 // we need to confirm that the caller code correctly handles Intrinsics
282 // for example (does not have 2 operands).
283 return false;
284}
285
286/// Checks if the given value is actually an undefined constant vector.
287static bool isUndefVector(const Value *V) {
288 if (isa<UndefValue>(V))
289 return true;
290 auto *C = dyn_cast<Constant>(V);
291 if (!C)
292 return false;
293 if (!C->containsUndefOrPoisonElement())
294 return false;
295 auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
296 if (!VecTy)
297 return false;
298 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
299 if (Constant *Elem = C->getAggregateElement(I))
300 if (!isa<UndefValue>(Elem))
301 return false;
302 }
303 return true;
304}
305
306/// Checks if the vector of instructions can be represented as a shuffle, like:
307/// %x0 = extractelement <4 x i8> %x, i32 0
308/// %x3 = extractelement <4 x i8> %x, i32 3
309/// %y1 = extractelement <4 x i8> %y, i32 1
310/// %y2 = extractelement <4 x i8> %y, i32 2
311/// %x0x0 = mul i8 %x0, %x0
312/// %x3x3 = mul i8 %x3, %x3
313/// %y1y1 = mul i8 %y1, %y1
314/// %y2y2 = mul i8 %y2, %y2
315/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
316/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
317/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
318/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
319/// ret <4 x i8> %ins4
320/// can be transformed into:
321/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
322/// i32 6>
323/// %2 = mul <4 x i8> %1, %1
324/// ret <4 x i8> %2
325/// We convert this initially to something like:
326/// %x0 = extractelement <4 x i8> %x, i32 0
327/// %x3 = extractelement <4 x i8> %x, i32 3
328/// %y1 = extractelement <4 x i8> %y, i32 1
329/// %y2 = extractelement <4 x i8> %y, i32 2
330/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
331/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
332/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
333/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
334/// %5 = mul <4 x i8> %4, %4
335/// %6 = extractelement <4 x i8> %5, i32 0
336/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
337/// %7 = extractelement <4 x i8> %5, i32 1
338/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
339/// %8 = extractelement <4 x i8> %5, i32 2
340/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
341/// %9 = extractelement <4 x i8> %5, i32 3
342/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
343/// ret <4 x i8> %ins4
344/// InstCombiner transforms this into a shuffle and vector mul
345/// Mask will return the Shuffle Mask equivalent to the extracted elements.
346/// TODO: Can we split off and reuse the shuffle mask detection from
347/// TargetTransformInfo::getInstructionThroughput?
348static Optional<TargetTransformInfo::ShuffleKind>
349isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
350 const auto *It =
351 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
352 if (It == VL.end())
353 return None;
354 auto *EI0 = cast<ExtractElementInst>(*It);
355 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
356 return None;
357 unsigned Size =
358 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
359 Value *Vec1 = nullptr;
360 Value *Vec2 = nullptr;
361 enum ShuffleMode { Unknown, Select, Permute };
362 ShuffleMode CommonShuffleMode = Unknown;
363 Mask.assign(VL.size(), UndefMaskElem);
364 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
365 // Undef can be represented as an undef element in a vector.
366 if (isa<UndefValue>(VL[I]))
367 continue;
368 auto *EI = cast<ExtractElementInst>(VL[I]);
369 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
370 return None;
371 auto *Vec = EI->getVectorOperand();
372 // We can extractelement from undef or poison vector.
373 if (isUndefVector(Vec))
374 continue;
375 // All vector operands must have the same number of vector elements.
376 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
377 return None;
378 if (isa<UndefValue>(EI->getIndexOperand()))
379 continue;
380 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
381 if (!Idx)
382 return None;
383 // Undefined behavior if Idx is negative or >= Size.
384 if (Idx->getValue().uge(Size))
385 continue;
386 unsigned IntIdx = Idx->getValue().getZExtValue();
387 Mask[I] = IntIdx;
388 // For correct shuffling we have to have at most 2 different vector operands
389 // in all extractelement instructions.
390 if (!Vec1 || Vec1 == Vec) {
391 Vec1 = Vec;
392 } else if (!Vec2 || Vec2 == Vec) {
393 Vec2 = Vec;
394 Mask[I] += Size;
395 } else {
396 return None;
397 }
398 if (CommonShuffleMode == Permute)
399 continue;
400 // If the extract index is not the same as the operation number, it is a
401 // permutation.
402 if (IntIdx != I) {
403 CommonShuffleMode = Permute;
404 continue;
405 }
406 CommonShuffleMode = Select;
407 }
408 // If we're not crossing lanes in different vectors, consider it as blending.
409 if (CommonShuffleMode == Select && Vec2)
410 return TargetTransformInfo::SK_Select;
411 // If Vec2 was never used, we have a permutation of a single vector, otherwise
412 // we have permutation of 2 vectors.
413 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
414 : TargetTransformInfo::SK_PermuteSingleSrc;
415}
416
417namespace {
418
419/// Main data required for vectorization of instructions.
420struct InstructionsState {
421 /// The very first instruction in the list with the main opcode.
422 Value *OpValue = nullptr;
423
424 /// The main/alternate instruction.
425 Instruction *MainOp = nullptr;
426 Instruction *AltOp = nullptr;
427
428 /// The main/alternate opcodes for the list of instructions.
429 unsigned getOpcode() const {
430 return MainOp ? MainOp->getOpcode() : 0;
431 }
432
433 unsigned getAltOpcode() const {
434 return AltOp ? AltOp->getOpcode() : 0;
435 }
436
437 /// Some of the instructions in the list have alternate opcodes.
438 bool isAltShuffle() const { return AltOp != MainOp; }
439
440 bool isOpcodeOrAlt(Instruction *I) const {
441 unsigned CheckedOpcode = I->getOpcode();
442 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
443 }
444
445 InstructionsState() = delete;
446 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
447 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
448};
449
450} // end anonymous namespace
451
452/// Chooses the correct key for scheduling data. If \p Op has the same (or
453/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
454/// OpValue.
455static Value *isOneOf(const InstructionsState &S, Value *Op) {
456 auto *I = dyn_cast<Instruction>(Op);
457 if (I && S.isOpcodeOrAlt(I))
458 return Op;
459 return S.OpValue;
460}
461
462/// \returns true if \p Opcode is allowed as part of the main/alternate
463/// instruction for SLP vectorization.
464///
465/// Example of unsupported opcode is SDIV that can potentially cause UB if the
466/// "shuffled out" lane would result in division by zero.
467static bool isValidForAlternation(unsigned Opcode) {
468 if (Instruction::isIntDivRem(Opcode))
469 return false;
470
471 return true;
472}
473
474/// \returns analysis of the Instructions in \p VL described in
475/// InstructionsState, the Opcode that we suppose the whole list
476/// could be vectorized even if its structure is diverse.
477static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
478 unsigned BaseIndex = 0) {
479 // Make sure these are all Instructions.
480 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
481 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
482
483 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
484 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
485 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
486 unsigned AltOpcode = Opcode;
487 unsigned AltIndex = BaseIndex;
488
489 // Check for one alternate opcode from another BinaryOperator.
490 // TODO - generalize to support all operators (types, calls etc.).
491 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
492 unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
493 if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
494 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
495 continue;
496 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
497 isValidForAlternation(Opcode)) {
498 AltOpcode = InstOpcode;
499 AltIndex = Cnt;
500 continue;
501 }
502 } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
503 Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
504 Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
505 if (Ty0 == Ty1) {
506 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
507 continue;
508 if (Opcode == AltOpcode) {
509 assert(isValidForAlternation(Opcode) &&
510 isValidForAlternation(InstOpcode) &&
511 "Cast isn't safe for alternation, logic needs to be updated!");
512 AltOpcode = InstOpcode;
513 AltIndex = Cnt;
514 continue;
515 }
516 }
517 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
518 continue;
519 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
520 }
521
522 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
523 cast<Instruction>(VL[AltIndex]));
524}
525
526/// \returns true if all of the values in \p VL have the same type or false
527/// otherwise.
528static bool allSameType(ArrayRef<Value *> VL) {
529 Type *Ty = VL[0]->getType();
530 for (int i = 1, e = VL.size(); i < e; i++)
531 if (VL[i]->getType() != Ty)
532 return false;
533
534 return true;
535}
536
537/// \returns True if Extract{Value,Element} instruction extracts element Idx.
538static Optional<unsigned> getExtractIndex(Instruction *E) {
539 unsigned Opcode = E->getOpcode();
540 assert((Opcode == Instruction::ExtractElement ||
541 Opcode == Instruction::ExtractValue) &&
542 "Expected extractelement or extractvalue instruction.");
543 if (Opcode == Instruction::ExtractElement) {
544 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
545 if (!CI)
546 return None;
547 return CI->getZExtValue();
548 }
549 ExtractValueInst *EI = cast<ExtractValueInst>(E);
550 if (EI->getNumIndices() != 1)
551 return None;
552 return *EI->idx_begin();
553}
554
555/// \returns True if in-tree use also needs extract. This refers to
556/// possible scalar operand in vectorized instruction.
557static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
558 TargetLibraryInfo *TLI) {
559 unsigned Opcode = UserInst->getOpcode();
560 switch (Opcode) {
561 case Instruction::Load: {
562 LoadInst *LI = cast<LoadInst>(UserInst);
563 return (LI->getPointerOperand() == Scalar);
564 }
565 case Instruction::Store: {
566 StoreInst *SI = cast<StoreInst>(UserInst);
567 return (SI->getPointerOperand() == Scalar);
568 }
569 case Instruction::Call: {
570 CallInst *CI = cast<CallInst>(UserInst);
571 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
572 for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
573 if (hasVectorInstrinsicScalarOpd(ID, i))
574 return (CI->getArgOperand(i) == Scalar);
575 }
576 LLVM_FALLTHROUGH;
577 }
578 default:
579 return false;
580 }
581}
582
583/// \returns the AA location that is being accessed by the instruction.
584static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
585 if (StoreInst *SI = dyn_cast<StoreInst>(I))
586 return MemoryLocation::get(SI);
587 if (LoadInst *LI = dyn_cast<LoadInst>(I))
588 return MemoryLocation::get(LI);
589 return MemoryLocation();
590}
591
592/// \returns True if the instruction is not a volatile or atomic load/store.
593static bool isSimple(Instruction *I) {
594 if (LoadInst *LI = dyn_cast<LoadInst>(I))
595 return LI->isSimple();
596 if (StoreInst *SI = dyn_cast<StoreInst>(I))
597 return SI->isSimple();
598 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
599 return !MI->isVolatile();
600 return true;
601}
602
603/// Shuffles \p Mask in accordance with the given \p SubMask.
604static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
605 if (SubMask.empty())
606 return;
607 if (Mask.empty()) {
608 Mask.append(SubMask.begin(), SubMask.end());
609 return;
610 }
611 SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
612 int TermValue = std::min(Mask.size(), SubMask.size());
613 for (int I = 0, E = SubMask.size(); I < E; ++I) {
614 if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
615 Mask[SubMask[I]] >= TermValue)
616 continue;
617 NewMask[I] = Mask[SubMask[I]];
618 }
619 Mask.swap(NewMask);
620}
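
Editor's note: a minimal standalone sketch of the mask composition addMask performs, assuming UndefMaskElem == -1 (LLVM's sentinel); the result is Mask'[I] = Mask[SubMask[I]], with out-of-range or undef entries left undefined.

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  static const int UndefMaskElem = -1; // assumption: matches LLVM's sentinel value

  static void composeMask(std::vector<int> &Mask, const std::vector<int> &SubMask) {
    if (SubMask.empty())
      return;
    if (Mask.empty()) {
      Mask = SubMask;
      return;
    }
    std::vector<int> NewMask(SubMask.size(), UndefMaskElem);
    int TermValue = std::min(Mask.size(), SubMask.size());
    for (int I = 0, E = SubMask.size(); I < E; ++I) {
      // Skip undef and out-of-range entries; they stay UndefMaskElem.
      if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
          Mask[SubMask[I]] >= TermValue)
        continue;
      NewMask[I] = Mask[SubMask[I]];
    }
    Mask.swap(NewMask);
  }

  int main() {
    std::vector<int> Mask = {3, 2, 1, 0};
    composeMask(Mask, {0, 2, 1, 3});
    for (int M : Mask)
      std::printf("%d ", M); // prints: 3 1 2 0
  }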
621
622/// Order may have elements assigned special value (size) which is out of
623/// bounds. Such indices only appear on places which correspond to undef values
624/// (see canReuseExtract for details) and are used to prevent undef values
625/// from affecting the operand ordering.
626/// The first loop below simply finds all unused indices and then the next loop
627/// nest assigns these indices for undef values positions.
628/// As an example below Order has two undef positions and they have assigned
629/// values 3 and 7 respectively:
630/// before: 6 9 5 4 9 2 1 0
631/// after: 6 3 5 4 7 2 1 0
632static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
633 const unsigned Sz = Order.size();
634 SmallBitVector UnusedIndices(Sz, /*t=*/true);
635 SmallBitVector MaskedIndices(Sz);
636 for (unsigned I = 0; I < Sz; ++I) {
637 if (Order[I] < Sz)
638 UnusedIndices.reset(Order[I]);
639 else
640 MaskedIndices.set(I);
641 }
642 if (MaskedIndices.none())
643 return;
644 assert(UnusedIndices.count() == MaskedIndices.count() &&
645 "Non-synced masked/available indices.");
646 int Idx = UnusedIndices.find_first();
647 int MIdx = MaskedIndices.find_first();
648 while (MIdx >= 0) {
649 assert(Idx >= 0 && "Indices must be synced.");
650 Order[MIdx] = Idx;
651 Idx = UnusedIndices.find_next(Idx);
652 MIdx = MaskedIndices.find_next(MIdx);
653 }
654}
655
656namespace llvm {
657
658static void inversePermutation(ArrayRef<unsigned> Indices,
659 SmallVectorImpl<int> &Mask) {
660 Mask.clear();
661 const unsigned E = Indices.size();
662 Mask.resize(E, UndefMaskElem);
663 for (unsigned I = 0; I < E; ++I)
664 Mask[Indices[I]] = I;
665}
666
667/// \returns inserting index of InsertElement or InsertValue instruction,
668/// using Offset as base offset for index.
669static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
670 int Index = Offset;
671 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
672 if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
673 auto *VT = cast<FixedVectorType>(IE->getType());
674 if (CI->getValue().uge(VT->getNumElements()))
675 return UndefMaskElem;
676 Index *= VT->getNumElements();
677 Index += CI->getZExtValue();
678 return Index;
679 }
680 if (isa<UndefValue>(IE->getOperand(2)))
681 return UndefMaskElem;
682 return None;
683 }
684
685 auto *IV = cast<InsertValueInst>(InsertInst);
686 Type *CurrentType = IV->getType();
687 for (unsigned I : IV->indices()) {
688 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
689 Index *= ST->getNumElements();
690 CurrentType = ST->getElementType(I);
691 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
692 Index *= AT->getNumElements();
693 CurrentType = AT->getElementType();
694 } else {
695 return None;
696 }
697 Index += I;
698 }
699 return Index;
700}
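
Editor's note, a worked example of the index flattening above: for an insert into {[4 x i16], [4 x i16]} at indices {1, 2} with Offset 0, the loop first scales by the two struct members (Index = 0 * 2 + 1 = 1) and then by the four array elements (Index = 1 * 4 + 2 = 6), i.e. the insert targets the seventh scalar slot of the flattened aggregate.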
701
702/// Reorders the list of scalars in accordance with the given \p Order and then
703/// the \p Mask. \p Order - is the original order of the scalars, need to
704/// reorder scalars into an unordered state at first according to the given
705/// order. Then the ordered scalars are shuffled once again in accordance with
706/// the provided mask.
707static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
708 ArrayRef<int> Mask) {
709 assert(!Mask.empty() && "Expected non-empty mask.");
710 SmallVector<Value *> Prev(Scalars.size(),
711 UndefValue::get(Scalars.front()->getType()));
712 Prev.swap(Scalars);
713 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
714 if (Mask[I] != UndefMaskElem)
715 Scalars[Mask[I]] = Prev[I];
716}
717
718namespace slpvectorizer {
719
720/// Bottom Up SLP Vectorizer.
721class BoUpSLP {
722 struct TreeEntry;
723 struct ScheduleData;
724
725public:
726 using ValueList = SmallVector<Value *, 8>;
727 using InstrList = SmallVector<Instruction *, 16>;
728 using ValueSet = SmallPtrSet<Value *, 16>;
729 using StoreList = SmallVector<StoreInst *, 8>;
730 using ExtraValueToDebugLocsMap =
731 MapVector<Value *, SmallVector<Instruction *, 2>>;
732 using OrdersType = SmallVector<unsigned, 4>;
733
734 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
735 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
736 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
737 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
738 : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
739 DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
740 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
741 // Use the vector register size specified by the target unless overridden
742 // by a command-line option.
743 // TODO: It would be better to limit the vectorization factor based on
744 // data type rather than just register size. For example, x86 AVX has
745 // 256-bit registers, but it does not support integer operations
746 // at that width (that requires AVX2).
747 if (MaxVectorRegSizeOption.getNumOccurrences())
748 MaxVecRegSize = MaxVectorRegSizeOption;
749 else
750 MaxVecRegSize =
751 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
752 .getFixedSize();
753
754 if (MinVectorRegSizeOption.getNumOccurrences())
755 MinVecRegSize = MinVectorRegSizeOption;
756 else
757 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
758 }
759
760 /// Vectorize the tree that starts with the elements in \p VL.
761 /// Returns the vectorized root.
762 Value *vectorizeTree();
763
764 /// Vectorize the tree but with the list of externally used values \p
765 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
766 /// generated extractvalue instructions.
767 Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
768
769 /// \returns the cost incurred by unwanted spills and fills, caused by
770 /// holding live values over call sites.
771 InstructionCost getSpillCost() const;
772
773 /// \returns the vectorization cost of the subtree that starts at \p VL.
774 /// A negative number means that this is profitable.
775 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
776
777 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
778 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
779 void buildTree(ArrayRef<Value *> Roots,
780 ArrayRef<Value *> UserIgnoreLst = None);
781
782 /// Builds external uses of the vectorized scalars, i.e. the list of
783 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
784 /// ExternallyUsedValues contains additional list of external uses to handle
785 /// vectorization of reductions.
786 void
787 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
788
789 /// Clear the internal data structures that are created by 'buildTree'.
790 void deleteTree() {
791 VectorizableTree.clear();
792 ScalarToTreeEntry.clear();
793 MustGather.clear();
794 ExternalUses.clear();
795 for (auto &Iter : BlocksSchedules) {
796 BlockScheduling *BS = Iter.second.get();
797 BS->clear();
798 }
799 MinBWs.clear();
800 InstrElementSize.clear();
801 }
802
803 unsigned getTreeSize() const { return VectorizableTree.size(); }
804
805 /// Perform LICM and CSE on the newly generated gather sequences.
806 void optimizeGatherSequence();
807
808 /// Checks if the specified gather tree entry \p TE can be represented as a
809 /// shuffled vector entry + (possibly) permutation with other gathers. It
810 /// implements the checks only for possibly ordered scalars (Loads,
811 /// ExtractElement, ExtractValue), which can be part of the graph.
812 Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
813
814 /// Gets reordering data for the given tree entry. If the entry is vectorized
815 /// - just return ReorderIndices, otherwise check if the scalars can be
816 /// reordered and return the most optimal order.
817 /// \param TopToBottom If true, include the order of vectorized stores and
818 /// insertelement nodes, otherwise skip them.
819 Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
820
821 /// Reorders the current graph to the most profitable order starting from the
822 /// root node to the leaf nodes. The best order is chosen only from the nodes
823 /// of the same size (vectorization factor). Smaller nodes are considered
824 /// parts of subgraph with smaller VF and they are reordered independently. We
825 /// can make it because we still need to extend smaller nodes to the wider VF
826 /// and we can merge reordering shuffles with the widening shuffles.
827 void reorderTopToBottom();
828
829 /// Reorders the current graph to the most profitable order starting from
830 /// leaves to the root. It allows to rotate small subgraphs and reduce the
831 /// number of reshuffles if the leaf nodes use the same order. In this case we
833 /// can merge the orders and just shuffle the user node instead of shuffling its
834 /// operands. Plus, even if the leaf nodes have different orders, it allows us to
834 /// sink reordering in the graph closer to the root node and merge it later
835 /// during analysis.
836 void reorderBottomToTop(bool IgnoreReorder = false);
837
838 /// \return The vector element size in bits to use when vectorizing the
839 /// expression tree ending at \p V. If V is a store, the size is the width of
840 /// the stored value. Otherwise, the size is the width of the largest loaded
841 /// value reaching V. This method is used by the vectorizer to calculate
842 /// vectorization factors.
843 unsigned getVectorElementSize(Value *V);
844
845 /// Compute the minimum type sizes required to represent the entries in a
846 /// vectorizable tree.
847 void computeMinimumValueSizes();
848
849 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
850 unsigned getMaxVecRegSize() const {
851 return MaxVecRegSize;
852 }
853
854 // \returns minimum vector register size as set by cl::opt.
855 unsigned getMinVecRegSize() const {
856 return MinVecRegSize;
857 }
858
859 unsigned getMinVF(unsigned Sz) const {
860 return std::max(2U, getMinVecRegSize() / Sz);
861 }
862
863 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
864 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
865 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
866 return MaxVF ? MaxVF : UINT_MAX;
867 }
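
Editor's note, a quick worked example of the two helpers above: with the default slp-min-reg-size of 128 bits and 32-bit scalars, getMinVF(32) returns max(2, 128 / 32) = 4; getMaximumVF defers to the target (or the slp-max-vf override), and a result of 0 is treated as "no limit" by mapping it to UINT_MAX.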
868
869 /// Check if homogeneous aggregate is isomorphic to some VectorType.
870 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
871 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
872 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
873 ///
874 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
875 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
876
877 /// \returns True if the VectorizableTree is both tiny and not fully
878 /// vectorizable. We do not vectorize such trees.
879 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
880
881 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
882 /// can be load combined in the backend. Load combining may not be allowed in
883 /// the IR optimizer, so we do not want to alter the pattern. For example,
884 /// partially transforming a scalar bswap() pattern into vector code is
885 /// effectively impossible for the backend to undo.
886 /// TODO: If load combining is allowed in the IR optimizer, this analysis
887 /// may not be necessary.
888 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
889
890 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
891 /// can be load combined in the backend. Load combining may not be allowed in
892 /// the IR optimizer, so we do not want to alter the pattern. For example,
893 /// partially transforming a scalar bswap() pattern into vector code is
894 /// effectively impossible for the backend to undo.
895 /// TODO: If load combining is allowed in the IR optimizer, this analysis
896 /// may not be necessary.
897 bool isLoadCombineCandidate() const;
898
899 OptimizationRemarkEmitter *getORE() { return ORE; }
900
901 /// This structure holds any data we need about the edges being traversed
902 /// during buildTree_rec(). We keep track of:
903 /// (i) the user TreeEntry index, and
904 /// (ii) the index of the edge.
905 struct EdgeInfo {
906 EdgeInfo() = default;
907 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
908 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
909 /// The user TreeEntry.
910 TreeEntry *UserTE = nullptr;
911 /// The operand index of the use.
912 unsigned EdgeIdx = UINT_MAX;
913#ifndef NDEBUG
914 friend inline raw_ostream &operator<<(raw_ostream &OS,
915 const BoUpSLP::EdgeInfo &EI) {
916 EI.dump(OS);
917 return OS;
918 }
919 /// Debug print.
920 void dump(raw_ostream &OS) const {
921 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
922 << " EdgeIdx:" << EdgeIdx << "}";
923 }
924 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
925#endif
926 };
927
928 /// A helper data structure to hold the operands of a vector of instructions.
929 /// This supports a fixed vector length for all operand vectors.
930 class VLOperands {
931 /// For each operand we need (i) the value, and (ii) the opcode that it
932 /// would be attached to if the expression was in a left-linearized form.
933 /// This is required to avoid illegal operand reordering.
934 /// For example:
935 /// \verbatim
936 /// 0 Op1
937 /// |/
938 /// Op1 Op2 Linearized + Op2
939 /// \ / ----------> |/
940 /// - -
941 ///
942 /// Op1 - Op2 (0 + Op1) - Op2
943 /// \endverbatim
944 ///
945 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
946 ///
947 /// Another way to think of this is to track all the operations across the
948 /// path from the operand all the way to the root of the tree and to
949 /// calculate the operation that corresponds to this path. For example, the
950 /// path from Op2 to the root crosses the RHS of the '-', therefore the
951 /// corresponding operation is a '-' (which matches the one in the
952 /// linearized tree, as shown above).
953 ///
954 /// For lack of a better term, we refer to this operation as Accumulated
955 /// Path Operation (APO).
956 struct OperandData {
957 OperandData() = default;
958 OperandData(Value *V, bool APO, bool IsUsed)
959 : V(V), APO(APO), IsUsed(IsUsed) {}
960 /// The operand value.
961 Value *V = nullptr;
962 /// TreeEntries only allow a single opcode, or an alternate sequence of
963 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
964 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
965 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
966 /// (e.g., Add/Mul)
967 bool APO = false;
968 /// Helper data for the reordering function.
969 bool IsUsed = false;
970 };
971
972 /// During operand reordering, we are trying to select the operand at lane
973 /// that matches best with the operand at the neighboring lane. Our
974 /// selection is based on the type of value we are looking for. For example,
975 /// if the neighboring lane has a load, we need to look for a load that is
976 /// accessing a consecutive address. These strategies are summarized in the
977 /// 'ReorderingMode' enumerator.
978 enum class ReorderingMode {
979 Load, ///< Matching loads to consecutive memory addresses
980 Opcode, ///< Matching instructions based on opcode (same or alternate)
981 Constant, ///< Matching constants
982 Splat, ///< Matching the same instruction multiple times (broadcast)
983 Failed, ///< We failed to create a vectorizable group
984 };
985
986 using OperandDataVec = SmallVector<OperandData, 2>;
987
988 /// A vector of operand vectors.
989 SmallVector<OperandDataVec, 4> OpsVec;
990
991 const DataLayout &DL;
992 ScalarEvolution &SE;
993 const BoUpSLP &R;
994
995 /// \returns the operand data at \p OpIdx and \p Lane.
996 OperandData &getData(unsigned OpIdx, unsigned Lane) {
997 return OpsVec[OpIdx][Lane];
998 }
999
1000 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1001 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1002 return OpsVec[OpIdx][Lane];
1003 }
1004
1005 /// Clears the used flag for all entries.
1006 void clearUsed() {
1007 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1008 OpIdx != NumOperands; ++OpIdx)
1009 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1010 ++Lane)
1011 OpsVec[OpIdx][Lane].IsUsed = false;
1012 }
1013
1014 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1015 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1016 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1017 }
1018
1019 // The hard-coded scores listed here are not very important, though it shall
1020 // be higher for better matches to improve the resulting cost. When
1021 // computing the scores of matching one sub-tree with another, we are
1022 // basically counting the number of values that are matching. So even if all
1023 // scores are set to 1, we would still get a decent matching result.
1024 // However, sometimes we have to break ties. For example we may have to
1025 // choose between matching loads vs matching opcodes. This is what these
1026 // scores are helping us with: they provide the order of preference. Also,
1027 // this is important if the scalar is externally used or used in another
1028 // tree entry node in the different lane.
1029
1030 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1031 static const int ScoreConsecutiveLoads = 4;
1032 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1033 static const int ScoreReversedLoads = 3;
1034 /// ExtractElementInst from same vector and consecutive indexes.
1035 static const int ScoreConsecutiveExtracts = 4;
1036 /// ExtractElementInst from same vector and reversed indices.
1037 static const int ScoreReversedExtracts = 3;
1038 /// Constants.
1039 static const int ScoreConstants = 2;
1040 /// Instructions with the same opcode.
1041 static const int ScoreSameOpcode = 2;
1042 /// Instructions with alt opcodes (e.g, add + sub).
1043 static const int ScoreAltOpcodes = 1;
1044 /// Identical instructions (a.k.a. splat or broadcast).
1045 static const int ScoreSplat = 1;
1046 /// Matching with an undef is preferable to failing.
1047 static const int ScoreUndef = 1;
1048 /// Score for failing to find a decent match.
1049 static const int ScoreFail = 0;
1050 /// User external to the vectorized code.
1051 static const int ExternalUseCost = 1;
1052 /// The user is internal but in a different lane.
1053 static const int UserInDiffLaneCost = ExternalUseCost;
1054
1055 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1056 static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
1057 ScalarEvolution &SE, int NumLanes) {
1058 if (V1 == V2)
1059 return VLOperands::ScoreSplat;
1060
1061 auto *LI1 = dyn_cast<LoadInst>(V1);
1062 auto *LI2 = dyn_cast<LoadInst>(V2);
1063 if (LI1 && LI2) {
1064 if (LI1->getParent() != LI2->getParent())
1065 return VLOperands::ScoreFail;
1066
1067 Optional<int> Dist = getPointersDiff(
1068 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1069 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1070 if (!Dist)
1071 return VLOperands::ScoreFail;
1072 // The distance is too large - still may be profitable to use masked
1073 // loads/gathers.
1074 if (std::abs(*Dist) > NumLanes / 2)
1075 return VLOperands::ScoreAltOpcodes;
1076 // This still will detect consecutive loads, but we might have "holes"
1077 // in some cases. It is ok for non-power-2 vectorization and may produce
1078 // better results. It should not affect current vectorization.
1079 return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
1080 : VLOperands::ScoreReversedLoads;
1081 }
1082
1083 auto *C1 = dyn_cast<Constant>(V1);
1084 auto *C2 = dyn_cast<Constant>(V2);
1085 if (C1 && C2)
1086 return VLOperands::ScoreConstants;
1087
1088 // Extracts from consecutive indexes of the same vector score better, as
1089 // the extracts could be optimized away.
1090 Value *EV1;
1091 ConstantInt *Ex1Idx;
1092 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1093 // Undefs are always profitable for extractelements.
1094 if (isa<UndefValue>(V2))
1095 return VLOperands::ScoreConsecutiveExtracts;
1096 Value *EV2 = nullptr;
1097 ConstantInt *Ex2Idx = nullptr;
1098 if (match(V2,
1099 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1100 m_Undef())))) {
1101 // Undefs are always profitable for extractelements.
1102 if (!Ex2Idx)
1103 return VLOperands::ScoreConsecutiveExtracts;
1104 if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
1105 return VLOperands::ScoreConsecutiveExtracts;
1106 if (EV2 == EV1) {
1107 int Idx1 = Ex1Idx->getZExtValue();
1108 int Idx2 = Ex2Idx->getZExtValue();
1109 int Dist = Idx2 - Idx1;
1110 // The distance is too large - still may be profitable to use
1111 // shuffles.
1112 if (std::abs(Dist) > NumLanes / 2)
1113 return VLOperands::ScoreAltOpcodes;
1114 return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
1115 : VLOperands::ScoreReversedExtracts;
1116 }
1117 }
1118 }
1119
1120 auto *I1 = dyn_cast<Instruction>(V1);
1121 auto *I2 = dyn_cast<Instruction>(V2);
1122 if (I1 && I2) {
1123 if (I1->getParent() != I2->getParent())
1124 return VLOperands::ScoreFail;
1125 InstructionsState S = getSameOpcode({I1, I2});
1126 // Note: Only consider instructions with <= 2 operands to avoid
1127 // complexity explosion.
1128 if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
1129 return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
1130 : VLOperands::ScoreSameOpcode;
1131 }
1132
1133 if (isa<UndefValue>(V2))
1134 return VLOperands::ScoreUndef;
1135
1136 return VLOperands::ScoreFail;
1137 }
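
Editor's note, a short worked trace of the scoring above: two loads of A[i] and A[i+1] from the same block have pointer distance 1 and score ScoreConsecutiveLoads (4); the same pair seen in reverse order scores ScoreReversedLoads (3); a load paired with an ordinary constant matches none of the cases and ends at ScoreFail (0), while a load paired with undef still earns ScoreUndef (1).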
1138
1139 /// Holds the values and their lanes that are taking part in the look-ahead
1140 /// score calculation. This is used in the external uses cost calculation.
1141 /// We need to hold all the lanes, at least in the case of a splat/broadcast,
1142 /// to correctly check for a use in a different lane.
1143 SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
1144
1145 /// \returns the additional cost due to uses of \p LHS and \p RHS that are
1146 /// either external to the vectorized code, or require shuffling.
1147 int getExternalUsesCost(const std::pair<Value *, int> &LHS,
1148 const std::pair<Value *, int> &RHS) {
1149 int Cost = 0;
1150 std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
1151 for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
1152 Value *V = Values[Idx].first;
1153 if (isa<Constant>(V)) {
1154 // Since this is a function pass, it doesn't make semantic sense to
1155 // walk the users of a subclass of Constant. The users could be in
1156 // another function, or even another module that happens to be in
1157 // the same LLVMContext.
1158 continue;
1159 }
1160
1161 // Calculate the absolute lane, using the minimum relative lane of LHS
1162 // and RHS as base and Idx as the offset.
1163 int Ln = std::min(LHS.second, RHS.second) + Idx;
1164 assert(Ln >= 0 && "Bad lane calculation");
1165 unsigned UsersBudget = LookAheadUsersBudget;
1166 for (User *U : V->users()) {
1167 if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
1168 // The user is in the VectorizableTree. Check if we need to insert.
1169 int UserLn = UserTE->findLaneForValue(U);
1170 assert(UserLn >= 0 && "Bad lane");
1171 // If the values are different, check just the lane of the current
1172 // value. If the values are the same, we need to add UserInDiffLaneCost
1173 // only if UserLn matches neither lane number.
1174 if ((LHS.first != RHS.first && UserLn != Ln) ||
1175 (LHS.first == RHS.first && UserLn != LHS.second &&
1176 UserLn != RHS.second)) {
1177 Cost += UserInDiffLaneCost;
1178 break;
1179 }
1180 } else {
1181 // Check if the user is in the look-ahead code.
1182 auto It2 = InLookAheadValues.find(U);
1183 if (It2 != InLookAheadValues.end()) {
1184 // The user is in the look-ahead code. Check the lane.
1185 if (!It2->getSecond().contains(Ln)) {
1186 Cost += UserInDiffLaneCost;
1187 break;
1188 }
1189 } else {
1190 // The user is neither in SLP tree nor in the look-ahead code.
1191 Cost += ExternalUseCost;
1192 break;
1193 }
1194 }
1195 // Limit the number of visited uses to cap compilation time.
1196 if (--UsersBudget == 0)
1197 break;
1198 }
1199 }
1200 return Cost;
1201 }
1202
1203 /// Go through the operands of \p LHS and \p RHS recursively until \p
1204 /// MaxLevel, and return the cumulative score. For example:
1205 /// \verbatim
1206 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1207 /// \ / \ / \ / \ /
1208 /// + + + +
1209 /// G1 G2 G3 G4
1210 /// \endverbatim
1211 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1212 /// each level recursively, accumulating the score. It starts from matching
1213 /// the additions at level 0, then moves on to the loads (level 1). The
1214 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1215 /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
1216 /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
1217 /// Please note that the order of the operands does not matter, as we
1218 /// evaluate the score of all profitable combinations of operands. In
1219 /// other words the score of G1 and G4 is the same as G1 and G2. This
1220 /// heuristic is based on ideas described in:
1221 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1222 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1223 /// Luís F. W. Góes
1224 int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
1225 const std::pair<Value *, int> &RHS, int CurrLevel,
1226 int MaxLevel) {
1227
1228 Value *V1 = LHS.first;
1229 Value *V2 = RHS.first;
1230 // Get the shallow score of V1 and V2.
1231 int ShallowScoreAtThisLevel = std::max(
1232 (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
1233 getExternalUsesCost(LHS, RHS));
1234 int Lane1 = LHS.second;
1235 int Lane2 = RHS.second;
1236
1237 // If reached MaxLevel,
1238 // or if V1 and V2 are not instructions,
1239 // or if they are SPLAT,
1240 // or if they are not consecutive,
1241 // or if it is profitable to vectorize the loads or extractelements,
1242 // return the current cost early.
1243 auto *I1 = dyn_cast<Instruction>(V1);
1244 auto *I2 = dyn_cast<Instruction>(V2);
1245 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1246 ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
1247 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1248 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1249 ShallowScoreAtThisLevel))
1250 return ShallowScoreAtThisLevel;
1251 assert(I1 && I2 && "Should have early exited.");
1252
1253 // Keep track of in-tree values for determining the external-use cost.
1254 InLookAheadValues[V1].insert(Lane1);
1255 InLookAheadValues[V2].insert(Lane2);
1256
1257 // Contains the I2 operand indexes that got matched with I1 operands.
1258 SmallSet<unsigned, 4> Op2Used;
1259
1260 // Recursion towards the operands of I1 and I2. We are trying all possible
1261 // operand pairs, and keeping track of the best score.
1262 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1263 OpIdx1 != NumOperands1; ++OpIdx1) {
1264 // Try to pair the operand of I1 at OpIdx1 with the best operand of I2.
1265 int MaxTmpScore = 0;
1266 unsigned MaxOpIdx2 = 0;
1267 bool FoundBest = false;
1268 // If I2 is commutative try all combinations.
1269 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1270 unsigned ToIdx = isCommutative(I2)
1271 ? I2->getNumOperands()
1272 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1273 assert(FromIdx <= ToIdx && "Bad index");
1274 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1275 // Skip operands already paired with OpIdx1.
1276 if (Op2Used.count(OpIdx2))
1277 continue;
1278 // Recursively calculate the cost at each level
1279 int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
1280 {I2->getOperand(OpIdx2), Lane2},
1281 CurrLevel + 1, MaxLevel);
1282 // Look for the best score.
1283 if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
1284 MaxTmpScore = TmpScore;
1285 MaxOpIdx2 = OpIdx2;
1286 FoundBest = true;
1287 }
1288 }
1289 if (FoundBest) {
1290 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1291 Op2Used.insert(MaxOpIdx2);
1292 ShallowScoreAtThisLevel += MaxTmpScore;
1293 }
1294 }
1295 return ShallowScoreAtThisLevel;
1296 }
1297
1298 /// \Returns the look-ahead score, which tells us how much the sub-trees
1299 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1300 /// score. This helps break ties in an informed way when we cannot decide on
1301 /// the order of the operands by just considering the immediate
1302 /// predecessors.
1303 int getLookAheadScore(const std::pair<Value *, int> &LHS,
1304 const std::pair<Value *, int> &RHS) {
1305 InLookAheadValues.clear();
1306 return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
1307 }
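
Editor's note: the look-ahead recursion above is easier to see on the G1..G4 example from the comment. The following standalone sketch is not part of SLPVectorizer.cpp; ToyLeaf, ToyNode, shallowScore and the score value 3 are hypothetical stand-ins for the real operands and the VLOperands::Score* constants. It scores two toy addition nodes by greedily pairing their operands and rewarding consecutive loads of the same array.

#include <cstdio>

// Toy operand: which array it loads from and at which index.
struct ToyLeaf { int ArrayId; int Index; };
// Toy node: a commutative "add" of two leaves, e.g. G1 = A[0] + B[0].
struct ToyNode { ToyLeaf Ops[2]; };

// Shallow score of a pair of leaves: 3 for consecutive loads of the same
// array (standing in for ScoreConsecutiveLoads), 0 otherwise (ScoreFail).
static int shallowScore(const ToyLeaf &L, const ToyLeaf &R) {
  return (L.ArrayId == R.ArrayId && R.Index == L.Index + 1) ? 3 : 0;
}

// Cumulative score over operand pairings: each operand of the left node is
// greedily matched with the best not-yet-used operand of the right node.
static int lookAheadScore(const ToyNode &L, const ToyNode &R) {
  bool Used[2] = {false, false};
  int Score = 0;
  for (const ToyLeaf &Op1 : L.Ops) {
    int Best = 0, BestIdx = -1;
    for (int J = 0; J < 2; ++J) {
      if (Used[J])
        continue;
      int S = shallowScore(Op1, R.Ops[J]);
      if (S > Best) { Best = S; BestIdx = J; }
    }
    if (BestIdx >= 0) { Used[BestIdx] = true; Score += Best; }
  }
  return Score;
}

int main() {
  // G1 = A[0]+B[0], G2 = A[1]+B[1], G3 = C[0]+D[0] (array ids 0..3).
  ToyNode G1{{{0, 0}, {1, 0}}}, G2{{{0, 1}, {1, 1}}}, G3{{{2, 0}, {3, 0}}};
  std::printf("score(G1,G2)=%d  score(G1,G3)=%d\n",
              lookAheadScore(G1, G2), lookAheadScore(G1, G3));
  return 0;
}

Under these assumptions score(G1,G2) is 6 while score(G1,G3) is 0, the ordering described in the comment above; the real implementation additionally recurses through the operand chains and subtracts the external-use cost.
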
1308
1309 // Search all operands in Ops[*][Lane] for the one that best matches
1310 // Ops[OpIdx][LastLane] and return its operand index.
1311 // If no good match can be found, return None.
1312 Optional<unsigned>
1313 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1314 ArrayRef<ReorderingMode> ReorderingModes) {
1315 unsigned NumOperands = getNumOperands();
1316
1317 // The operand of the previous lane at OpIdx.
1318 Value *OpLastLane = getData(OpIdx, LastLane).V;
1319
1320 // Our strategy mode for OpIdx.
1321 ReorderingMode RMode = ReorderingModes[OpIdx];
1322
1323 // The linearized opcode of the operand at OpIdx, Lane.
1324 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1325
1326 // The best operand index and its score.
1327 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1328 // are using the score to differentiate between the two.
1329 struct BestOpData {
1330 Optional<unsigned> Idx = None;
1331 unsigned Score = 0;
1332 } BestOp;
1333
1334 // Iterate through all unused operands and look for the best.
1335 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1336 // Get the operand at Idx and Lane.
1337 OperandData &OpData = getData(Idx, Lane);
1338 Value *Op = OpData.V;
1339 bool OpAPO = OpData.APO;
1340
1341 // Skip already selected operands.
1342 if (OpData.IsUsed)
1343 continue;
1344
1345 // Skip if we are trying to move the operand to a position with a
1346 // different opcode in the linearized tree form. This would break the
1347 // semantics.
1348 if (OpAPO != OpIdxAPO)
1349 continue;
1350
1351 // Look for an operand that matches the current mode.
1352 switch (RMode) {
1353 case ReorderingMode::Load:
1354 case ReorderingMode::Constant:
1355 case ReorderingMode::Opcode: {
1356 bool LeftToRight = Lane > LastLane;
1357 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1358 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1359 unsigned Score =
1360 getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
1361 if (Score > BestOp.Score) {
1362 BestOp.Idx = Idx;
1363 BestOp.Score = Score;
1364 }
1365 break;
1366 }
1367 case ReorderingMode::Splat:
1368 if (Op == OpLastLane)
1369 BestOp.Idx = Idx;
1370 break;
1371 case ReorderingMode::Failed:
1372 return None;
1373 }
1374 }
1375
1376 if (BestOp.Idx) {
1377 getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
1378 return BestOp.Idx;
1379 }
1380 // If we could not find a good match return None.
1381 return None;
1382 }
1383
1384 /// Helper for reorderOperandVecs.
1385 /// \returns the lane that we should start reordering from. This is the one
1386 /// which has the least number of operands that can freely move about, or
1387 /// that is less profitable to reorder because it already has the most
1388 /// optimal set of operands.
1388 unsigned getBestLaneToStartReordering() const {
1389 unsigned Min = UINT_MAX;
1390 unsigned SameOpNumber = 0;
1391 // std::pair<unsigned, unsigned> is used to implement a simple voting
1392 // algorithm and choose the lane with the least number of operands that
1393 // can freely move about, or that is less profitable because it already has the
1394 // most optimal set of operands. The first unsigned is a counter for
1395 // voting, the second unsigned is the counter of lanes with instructions
1396 // with same/alternate opcodes and same parent basic block.
1397 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
1398 // Try to be closer to the original results, if we have multiple lanes
1399 // with same cost. If 2 lanes have the same cost, use the one with the
1400 // lowest index.
1401 for (int I = getNumLanes(); I > 0; --I) {
1402 unsigned Lane = I - 1;
1403 OperandsOrderData NumFreeOpsHash =
1404 getMaxNumOperandsThatCanBeReordered(Lane);
1405 // Compare the number of operands that can move and choose the one with
1406 // the least number.
1407 if (NumFreeOpsHash.NumOfAPOs < Min) {
1408 Min = NumFreeOpsHash.NumOfAPOs;
1409 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1410 HashMap.clear();
1411 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1412 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1413 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1414 // Select the most optimal lane in terms of number of operands that
1415 // should be moved around.
1416 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1417 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1418 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1419 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1420 auto It = HashMap.find(NumFreeOpsHash.Hash);
1421 if (It == HashMap.end())
1422 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1423 else
1424 ++It->second.first;
1425 }
1426 }
1427 // Select the lane with the minimum counter.
1428 unsigned BestLane = 0;
1429 unsigned CntMin = UINT_MAX;
1430 for (const auto &Data : reverse(HashMap)) {
1431 if (Data.second.first < CntMin) {
1432 CntMin = Data.second.first;
1433 BestLane = Data.second.second;
1434 }
1435 }
1436 return BestLane;
1437 }
1438
1439 /// Data structure that helps to reorder operands.
1440 struct OperandsOrderData {
1441 /// The best number of operands with the same APOs, which can be
1442 /// reordered.
1443 unsigned NumOfAPOs = UINT_MAX;
1444 /// Number of operands with the same/alternate instruction opcode and
1445 /// parent.
1446 unsigned NumOpsWithSameOpcodeParent = 0;
1447 /// Hash for the actual operands ordering.
1448 /// Used to count operands, actually their position id and opcode
1449 /// value. It is used in the voting mechanism to find the lane with the
1450 /// least number of operands that can freely move about, or that is less
1451 /// profitable because it already has the most optimal set of operands. Can
1452 /// be replaced with a SmallVector<unsigned>, but the hash code is faster
1453 /// and requires less memory.
1454 unsigned Hash = 0;
1455 };
1456 /// \returns the maximum number of operands that are allowed to be reordered
1457 /// for \p Lane and the number of compatible instructions (with the same
1458 /// parent/opcode). This is used as a heuristic for selecting the first lane
1459 /// to start operand reordering.
1460 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1461 unsigned CntTrue = 0;
1462 unsigned NumOperands = getNumOperands();
1463 // Operands with the same APO can be reordered. We therefore need to count
1464 // how many of them we have for each APO, like this: Cnt[APO] = x.
1465 // Since we only have two APOs, namely true and false, we can avoid using
1466 // a map. Instead we can simply count the number of operands that
1467 // correspond to one of them (in this case the 'true' APO), and calculate
1468 // the other by subtracting it from the total number of operands.
1469 // Operands with the same instruction opcode and parent are more
1470 // profitable since we don't need to move them in many cases; with high
1471 // probability, such a lane can already be vectorized effectively.
1472 bool AllUndefs = true;
1473 unsigned NumOpsWithSameOpcodeParent = 0;
1474 Instruction *OpcodeI = nullptr;
1475 BasicBlock *Parent = nullptr;
1476 unsigned Hash = 0;
1477 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1478 const OperandData &OpData = getData(OpIdx, Lane);
1479 if (OpData.APO)
1480 ++CntTrue;
1481 // Use Boyer-Moore majority voting for finding the majority opcode and
1482 // the number of times it occurs.
1483 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1484 if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
1485 I->getParent() != Parent) {
1486 if (NumOpsWithSameOpcodeParent == 0) {
1487 NumOpsWithSameOpcodeParent = 1;
1488 OpcodeI = I;
1489 Parent = I->getParent();
1490 } else {
1491 --NumOpsWithSameOpcodeParent;
1492 }
1493 } else {
1494 ++NumOpsWithSameOpcodeParent;
1495 }
1496 }
1497 Hash = hash_combine(
1498 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1499 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1500 }
1501 if (AllUndefs)
1502 return {};
1503 OperandsOrderData Data;
1504 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1505 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1506 Data.Hash = Hash;
1507 return Data;
1508 }
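
Editor's note: the "Boyer-Moore majority voting" mentioned in the loop above boils down to one candidate and one counter. A minimal standalone sketch of the canonical algorithm follows (plain ints stand in for the (opcode, parent) keys that getMaxNumOperandsThatCanBeReordered actually compares; this is illustrative, not LLVM code).

#include <cstdio>
#include <vector>

// Boyer-Moore majority vote: returns the candidate that survives the single
// voting pass over Keys.
static int majorityCandidate(const std::vector<int> &Keys) {
  int Candidate = 0, Count = 0;
  for (int K : Keys) {
    if (Count == 0) {            // adopt a new candidate
      Candidate = K;
      Count = 1;
    } else if (K == Candidate) { // a vote for the current candidate
      ++Count;
    } else {                     // a vote against it
      --Count;
    }
  }
  return Candidate;
}

int main() {
  // Keys of the operands in one lane; 13 occurs most often and wins.
  std::vector<int> LaneKeys = {13, 13, 34, 13, 55, 13};
  std::printf("majority candidate: %d\n", majorityCandidate(LaneKeys));
  return 0;
}
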
1509
1510 /// Go through the instructions in VL and append their operands.
1511 void appendOperandsOfVL(ArrayRef<Value *> VL) {
1512 assert(!VL.empty() && "Bad VL");
1513 assert((empty() || VL.size() == getNumLanes()) &&
1514 "Expected same number of lanes");
1515 assert(isa<Instruction>(VL[0]) && "Expected instruction");
1516 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1517 OpsVec.resize(NumOperands);
1518 unsigned NumLanes = VL.size();
1519 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1520 OpsVec[OpIdx].resize(NumLanes);
1521 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1522 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1523 // Our tree has just 3 nodes: the root and two operands.
1524 // It is therefore trivial to get the APO. We only need to check the
1525 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1526 // RHS operand. The LHS operand of both add and sub is never attached
1527 // to an inverse operation in the linearized form, therefore its APO
1528 // is false. The RHS APO is true only if VL[Lane] is an inverse operation.
1529
1530 // Since operand reordering is performed on groups of commutative
1531 // operations or alternating sequences (e.g., +, -), we can safely
1532 // tell the inverse operations by checking commutativity.
1533 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1534 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1535 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1536 APO, false};
1537 }
1538 }
1539 }
1540
1541 /// \returns the number of operands.
1542 unsigned getNumOperands() const { return OpsVec.size(); }
1543
1544 /// \returns the number of lanes.
1545 unsigned getNumLanes() const { return OpsVec[0].size(); }
1546
1547 /// \returns the operand value at \p OpIdx and \p Lane.
1548 Value *getValue(unsigned OpIdx, unsigned Lane) const {
1549 return getData(OpIdx, Lane).V;
1550 }
1551
1552 /// \returns true if the data structure is empty.
1553 bool empty() const { return OpsVec.empty(); }
1554
1555 /// Clears the data.
1556 void clear() { OpsVec.clear(); }
1557
1558 /// \Returns true if there are enough operands identical to \p Op to fill
1559 /// the whole vector.
1560 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
1561 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1562 bool OpAPO = getData(OpIdx, Lane).APO;
1563 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1564 if (Ln == Lane)
1565 continue;
1566 // This is set to true if we found a candidate for broadcast at Lane.
1567 bool FoundCandidate = false;
1568 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1569 OperandData &Data = getData(OpI, Ln);
1570 if (Data.APO != OpAPO || Data.IsUsed)
1571 continue;
1572 if (Data.V == Op) {
1573 FoundCandidate = true;
1574 Data.IsUsed = true;
1575 break;
1576 }
1577 }
1578 if (!FoundCandidate)
1579 return false;
1580 }
1581 return true;
1582 }
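
Editor's note: shouldBroadcast() asks whether the value at (OpIdx, Lane) can also be found, still unused, somewhere in every other lane. A small standalone sketch of that check on a plain integer grid (hypothetical names; each match is consumed, mirroring the way the real code sets IsUsed):

#include <cstdio>
#include <vector>

// Ops[OpIdx][Lane] holds a scalar id. Returns true if Val occurs in every
// lane except SkipLane, so a single element could be broadcast to fill the
// whole operand vector.
static bool canBroadcast(const std::vector<std::vector<int>> &Ops, int Val,
                         unsigned SkipLane) {
  unsigned NumOps = Ops.size();
  unsigned NumLanes = Ops.empty() ? 0u : (unsigned)Ops[0].size();
  std::vector<std::vector<bool>> Used(NumOps,
                                      std::vector<bool>(NumLanes, false));
  for (unsigned Ln = 0; Ln < NumLanes; ++Ln) {
    if (Ln == SkipLane)
      continue;
    bool Found = false;
    for (unsigned Op = 0; Op < NumOps; ++Op) {
      if (!Used[Op][Ln] && Ops[Op][Ln] == Val) {
        Used[Op][Ln] = true; // consume the match
        Found = true;
        break;
      }
    }
    if (!Found)
      return false;
  }
  return true;
}

int main() {
  // Two operands, four lanes; the value 7 shows up in every lane.
  std::vector<std::vector<int>> Ops = {{7, 1, 7, 2}, {3, 7, 4, 7}};
  std::printf("broadcast 7? %d\n", canBroadcast(Ops, 7, /*SkipLane=*/0));
  return 0;
}
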
1583
1584 public:
1585 /// Initialize with all the operands of the instruction vector \p RootVL.
1586 VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
1587 ScalarEvolution &SE, const BoUpSLP &R)
1588 : DL(DL), SE(SE), R(R) {
1589 // Append all the operands of RootVL.
1590 appendOperandsOfVL(RootVL);
1591 }
1592
1593 /// \Returns a value vector with the operands across all lanes for the
1594 /// operand at \p OpIdx.
1595 ValueList getVL(unsigned OpIdx) const {
1596 ValueList OpVL(OpsVec[OpIdx].size());
1597 assert(OpsVec[OpIdx].size() == getNumLanes() &&
1598 "Expected same num of lanes across all operands");
1599 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
1600 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
1601 return OpVL;
1602 }
1603
1604 // Performs operand reordering for 2 or more operands.
1605 // The original operands are in OrigOps[OpIdx][Lane].
1606 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
1607 void reorder() {
1608 unsigned NumOperands = getNumOperands();
1609 unsigned NumLanes = getNumLanes();
1610 // Each operand has its own mode. We are using this mode to help us select
1611 // the instructions for each lane, so that they match best with the ones
1612 // we have selected so far.
1613 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
1614
1615 // This is a greedy single-pass algorithm. We are going over each lane
1616 // once and deciding on the best order right away with no back-tracking.
1617 // However, in order to increase its effectiveness, we start with the lane
1618 // that has operands that can move the least. For example, given the
1619 // following lanes:
1620 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
1621 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
1622 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
1623 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
1624 // we will start at Lane 1, since the operands of the subtraction cannot
1625 // be reordered. Then we will visit the rest of the lanes in a circular
1626 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
1627
1628 // Find the first lane that we will start our search from.
1629 unsigned FirstLane = getBestLaneToStartReordering();
1630
1631 // Initialize the modes.
1632 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1633 Value *OpLane0 = getValue(OpIdx, FirstLane);
1634 // Keep track if we have instructions with all the same opcode on one
1635 // side.
1636 if (isa<LoadInst>(OpLane0))
1637 ReorderingModes[OpIdx] = ReorderingMode::Load;
1638 else if (isa<Instruction>(OpLane0)) {
1639 // Check if OpLane0 should be broadcast.
1640 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
1641 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1642 else
1643 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
1644 }
1645 else if (isa<Constant>(OpLane0))
1646 ReorderingModes[OpIdx] = ReorderingMode::Constant;
1647 else if (isa<Argument>(OpLane0))
1648 // Our best hope is a Splat. It may save some cost in some cases.
1649 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1650 else
1651 // NOTE: This should be unreachable.
1652 ReorderingModes[OpIdx] = ReorderingMode::Failed;
1653 }
1654
1655 // Check that we don't have the same operands. There is no need to reorder
1656 // if the operands are just a perfect or shuffled diamond match. Do not skip
1657 // the reordering for possible broadcasts or a non-power-of-2 number of
1658 // scalars (just for now).
1659 auto &&SkipReordering = [this]() {
1660 SmallPtrSet<Value *, 4> UniqueValues;
1661 ArrayRef<OperandData> Op0 = OpsVec.front();
1662 for (const OperandData &Data : Op0)
1663 UniqueValues.insert(Data.V);
1664 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
1665 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
1666 return !UniqueValues.contains(Data.V);
1667 }))
1668 return false;
1669 }
1670 // TODO: Check if we can remove the check for a non-power-of-2 number of
1671 // scalars after full support of non-power-of-2 vectorization.
1672 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
1673 };
1674
1675 // If the initial strategy fails for any of the operand indexes, then we
1676 // perform reordering again in a second pass. This helps avoid assigning
1677 // high priority to the failed strategy, and should improve reordering for
1678 // the non-failed operand indexes.
1679 for (int Pass = 0; Pass != 2; ++Pass) {
1680 // Check if there is no need to reorder the operands, since they are a
1681 // perfect or shuffled diamond match.
1682 // We need to do this to avoid extra external-use cost counting for
1683 // shuffled matches, which may cause regressions.
1684 if (SkipReordering())
1685 break;
1686 // Skip the second pass if the first pass did not fail.
1687 bool StrategyFailed = false;
1688 // Mark all operand data as free to use.
1689 clearUsed();
1690 // We keep the original operand order for the FirstLane, so reorder the
1691 // rest of the lanes. We are visiting the nodes in a circular fashion,
1692 // using FirstLane as the center point and increasing the radius
1693 // distance.
1694 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
1695 // Visit the lane on the right and then the lane on the left.
1696 for (int Direction : {+1, -1}) {
1697 int Lane = FirstLane + Direction * Distance;
1698 if (Lane < 0 || Lane >= (int)NumLanes)
1699 continue;
1700 int LastLane = Lane - Direction;
1701 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
1702 "Out of bounds");
1703 // Look for a good match for each operand.
1704 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1705 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
1706 Optional<unsigned> BestIdx =
1707 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
1708 // By not selecting a value, we allow the operands that follow to
1709 // select a better matching value. We will get a non-null value in
1710 // the next run of getBestOperand().
1711 if (BestIdx) {
1712 // Swap the current operand with the one returned by
1713 // getBestOperand().
1714 swap(OpIdx, BestIdx.getValue(), Lane);
1715 } else {
1716 // We failed to find a best operand, set mode to 'Failed'.
1717 ReorderingModes[OpIdx] = ReorderingMode::Failed;
1718 // Enable the second pass.
1719 StrategyFailed = true;
1720 }
1721 }
1722 }
1723 }
1724 // Skip second pass if the strategy did not fail.
1725 if (!StrategyFailed)
1726 break;
1727 }
1728 }
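
Editor's note: the circular visiting order used by reorder() is easy to reproduce in isolation. The sketch below is illustrative only; it prints the lanes in the order the nested loops above visit them, and with FirstLane = 1 and four lanes it yields 2, 0, 3, matching the example in the comment.

#include <cstdio>
#include <initializer_list>

// Print the order in which lanes are visited: start at FirstLane and move
// outwards, taking the lane on the right before the lane on the left at
// each distance.
static void printVisitOrder(int FirstLane, int NumLanes) {
  std::printf("start at lane %d\n", FirstLane);
  for (int Distance = 1; Distance != NumLanes; ++Distance) {
    for (int Direction : {+1, -1}) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= NumLanes)
        continue;
      std::printf("visit lane %d (last lane %d)\n", Lane, Lane - Direction);
    }
  }
}

int main() {
  printVisitOrder(/*FirstLane=*/1, /*NumLanes=*/4);
  return 0;
}
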
1729
1730#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1731 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
1732 switch (RMode) {
1733 case ReorderingMode::Load:
1734 return "Load";
1735 case ReorderingMode::Opcode:
1736 return "Opcode";
1737 case ReorderingMode::Constant:
1738 return "Constant";
1739 case ReorderingMode::Splat:
1740 return "Splat";
1741 case ReorderingMode::Failed:
1742 return "Failed";
1743 }
1744 llvm_unreachable("Unimplemented Reordering Type");
1745 }
1746
1747 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
1748 raw_ostream &OS) {
1749 return OS << getModeStr(RMode);
1750 }
1751
1752 /// Debug print.
1753 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
1754 printMode(RMode, dbgs());
1755 }
1756
1757 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
1758 return printMode(RMode, OS);
1759 }
1760
1761 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
1762 const unsigned Indent = 2;
1763 unsigned Cnt = 0;
1764 for (const OperandDataVec &OpDataVec : OpsVec) {
1765 OS << "Operand " << Cnt++ << "\n";
1766 for (const OperandData &OpData : OpDataVec) {
1767 OS.indent(Indent) << "{";
1768 if (Value *V = OpData.V)
1769 OS << *V;
1770 else
1771 OS << "null";
1772 OS << ", APO:" << OpData.APO << "}\n";
1773 }
1774 OS << "\n";
1775 }
1776 return OS;
1777 }
1778
1779 /// Debug print.
1780 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
1781#endif
1782 };
1783
1784 /// Checks if the instruction is marked for deletion.
1785 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
1786
1787 /// Marks the operands of the given values for later deletion by replacing them with Undefs.
1788 void eraseInstructions(ArrayRef<Value *> AV);
1789
1790 ~BoUpSLP();
1791
1792private:
1793 /// Checks if all users of \p I are the part of the vectorization tree.
1794 bool areAllUsersVectorized(Instruction *I,
1795 ArrayRef<Value *> VectorizedVals) const;
1796
1797 /// \returns the cost of the vectorizable entry.
1798 InstructionCost getEntryCost(const TreeEntry *E,
1799 ArrayRef<Value *> VectorizedVals);
1800
1801 /// This is the recursive part of buildTree.
1802 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
1803 const EdgeInfo &EI);
1804
1805 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
1806 /// be vectorized to use the original vector (or aggregate "bitcast" to a
1807 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
1808 /// returns false, setting \p CurrentOrder to either an empty vector or a
1809 /// non-identity permutation that allows reusing the extract instructions.
1810 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
1811 SmallVectorImpl<unsigned> &CurrentOrder) const;
1812
1813 /// Vectorize a single entry in the tree.
1814 Value *vectorizeTree(TreeEntry *E);
1815
1816 /// Vectorize a single entry in the tree, starting in \p VL.
1817 Value *vectorizeTree(ArrayRef<Value *> VL);
1818
1819 /// \returns the scalarization cost for this type. Scalarization in this
1820 /// context means the creation of vectors from a group of scalars. If \p
1821 /// NeedToShuffle is true, we need to add the cost of reshuffling some of
1822 /// the vector elements.
1823 InstructionCost getGatherCost(FixedVectorType *Ty,
1824 const DenseSet<unsigned> &ShuffledIndices,
1825 bool NeedToShuffle) const;
1826
1827 /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
1828 /// tree entries.
1829 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
1830 /// previous tree entries. \p Mask is filled with the shuffle mask.
1831 Optional<TargetTransformInfo::ShuffleKind>
1832 isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
1833 SmallVectorImpl<const TreeEntry *> &Entries);
1834
1835 /// \returns the scalarization cost for this list of values. Assuming that
1836 /// this subtree gets vectorized, we may need to extract the values from the
1837 /// roots. This method calculates the cost of extracting the values.
1838 InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
1839
1840 /// Set the Builder insert point to one after the last instruction in
1841 /// the bundle.
1842 void setInsertPointAfterBundle(const TreeEntry *E);
1843
1844 /// \returns a vector from a collection of scalars in \p VL.
1845 Value *gather(ArrayRef<Value *> VL);
1846
1847 /// \returns whether the VectorizableTree is fully vectorizable and will
1848 /// be beneficial even if the tree height is tiny.
1849 bool isFullyVectorizableTinyTree(bool ForReduction) const;
1850
1851 /// Reorder commutative or alt operands to get better probability of
1852 /// generating vectorized code.
1853 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1854 SmallVectorImpl<Value *> &Left,
1855 SmallVectorImpl<Value *> &Right,
1856 const DataLayout &DL,
1857 ScalarEvolution &SE,
1858 const BoUpSLP &R);
1859 struct TreeEntry {
1860 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
1861 TreeEntry(VecTreeTy &Container) : Container(Container) {}
1862
1863 /// \returns true if the scalars in VL are equal to this entry.
1864 bool isSame(ArrayRef<Value *> VL) const {
1865 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
1866 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
1867 return std::equal(VL.begin(), VL.end(), Scalars.begin());
1868 return VL.size() == Mask.size() &&
1869 std::equal(VL.begin(), VL.end(), Mask.begin(),
1870 [Scalars](Value *V, int Idx) {
1871 return (isa<UndefValue>(V) &&
1872 Idx == UndefMaskElem) ||
1873 (Idx != UndefMaskElem && V == Scalars[Idx]);
1874 });
1875 };
1876 if (!ReorderIndices.empty()) {
1877 // TODO: implement matching if the nodes are just reordered; we can
1878 // still treat the vector as the same if the list of scalars matches VL
1879 // directly, without reordering.
1880 SmallVector<int> Mask;
1881 inversePermutation(ReorderIndices, Mask);
1882 if (VL.size() == Scalars.size())
1883 return IsSame(Scalars, Mask);
1884 if (VL.size() == ReuseShuffleIndices.size()) {
1885 ::addMask(Mask, ReuseShuffleIndices);
1886 return IsSame(Scalars, Mask);
1887 }
1888 return false;
1889 }
1890 return IsSame(Scalars, ReuseShuffleIndices);
1891 }
1892
1893 /// \returns true if current entry has same operands as \p TE.
1894 bool hasEqualOperands(const TreeEntry &TE) const {
1895 if (TE.getNumOperands() != getNumOperands())
1896 return false;
1897 SmallBitVector Used(getNumOperands());
1898 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
1899 unsigned PrevCount = Used.count();
1900 for (unsigned K = 0; K < E; ++K) {
1901 if (Used.test(K))
1902 continue;
1903 if (getOperand(K) == TE.getOperand(I)) {
1904 Used.set(K);
1905 break;
1906 }
1907 }
1908 // Check if we actually found the matching operand.
1909 if (PrevCount == Used.count())
1910 return false;
1911 }
1912 return true;
1913 }
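
Editor's note: hasEqualOperands() is a small quadratic multiset comparison guarded by a "used" mask. A standalone sketch of the same pattern on vectors of ints (hypothetical names, not LLVM code):

#include <cstdio>
#include <vector>

// Returns true if A and B contain the same operand lists, possibly in a
// different order. Each list in A may be matched at most once, enforced by
// the Used mask, just like the SmallBitVector in hasEqualOperands().
static bool sameOperandSets(const std::vector<std::vector<int>> &A,
                            const std::vector<std::vector<int>> &B) {
  if (A.size() != B.size())
    return false;
  std::vector<bool> Used(A.size(), false);
  for (const std::vector<int> &Ops : B) {
    bool Matched = false;
    for (unsigned K = 0; K < A.size(); ++K) {
      if (!Used[K] && A[K] == Ops) {
        Used[K] = true; // never match the same operand list twice
        Matched = true;
        break;
      }
    }
    if (!Matched)
      return false;
  }
  return true;
}

int main() {
  std::vector<std::vector<int>> X = {{1, 2}, {3, 4}};
  std::vector<std::vector<int>> Y = {{3, 4}, {1, 2}}; // same lists, swapped
  std::printf("equal operands? %d\n", sameOperandSets(X, Y));
  return 0;
}
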
1914
1915 /// \return Final vectorization factor for the node. Defined by the total
1916 /// number of vectorized scalars, including those used several times in the
1917 /// entry and counted in the \a ReuseShuffleIndices, if any.
1918 unsigned getVectorFactor() const {
1919 if (!ReuseShuffleIndices.empty())
1920 return ReuseShuffleIndices.size();
1921 return Scalars.size();
1922 };
1923
1924 /// A vector of scalars.
1925 ValueList Scalars;
1926
1927 /// The Scalars are vectorized into this value. It is initialized to Null.
1928 Value *VectorizedValue = nullptr;
1929
1930 /// Do we need to gather this sequence or vectorize it
1931 /// (either with vector instruction or with scatter/gather
1932 /// intrinsics for store/load)?
1933 enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
1934 EntryState State;
1935
1936 /// Does this sequence require some shuffling?
1937 SmallVector<int, 4> ReuseShuffleIndices;
1938
1939 /// Does this entry require reordering?
1940 SmallVector<unsigned, 4> ReorderIndices;
1941
1942 /// Points back to the VectorizableTree.
1943 ///
1944 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
1945 /// to be a pointer and needs to be able to initialize the child iterator.
1946 /// Thus we need a reference back to the container to translate the indices
1947 /// to entries.
1948 VecTreeTy &Container;
1949
1950 /// The TreeEntry index containing the user of this entry. We can actually
1951 /// have multiple users so the data structure is not truly a tree.
1952 SmallVector<EdgeInfo, 1> UserTreeIndices;
1953
1954 /// The index of this treeEntry in VectorizableTree.
1955 int Idx = -1;
1956
1957 private:
1958 /// The operands of each instruction in each lane Operands[op_index][lane].
1959 /// Note: This helps avoid the replication of the code that performs the
1960 /// reordering of operands during buildTree_rec() and vectorizeTree().
1961 SmallVector<ValueList, 2> Operands;
1962
1963 /// The main/alternate instruction.
1964 Instruction *MainOp = nullptr;
1965 Instruction *AltOp = nullptr;
1966
1967 public:
1968 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
1969 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
1970 if (Operands.size() < OpIdx + 1)
1971 Operands.resize(OpIdx + 1);
1972 assert(Operands[OpIdx].empty() && "Already resized?");
1973 assert(OpVL.size() <= Scalars.size() &&
1974 "Number of operands is greater than the number of scalars.");
1975 Operands[OpIdx].resize(OpVL.size());
1976 copy(OpVL, Operands[OpIdx].begin());
1977 }
1978
1979 /// Set the operands of this bundle in their original order.
1980 void setOperandsInOrder() {
1981 assert(Operands.empty() && "Already initialized?");
1982 auto *I0 = cast<Instruction>(Scalars[0]);
1983 Operands.resize(I0->getNumOperands());
1984 unsigned NumLanes = Scalars.size();
1985 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
1986 OpIdx != NumOperands; ++OpIdx) {
1987 Operands[OpIdx].resize(NumLanes);
1988 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1989 auto *I = cast<Instruction>(Scalars[Lane]);
1990 assert(I->getNumOperands() == NumOperands &&
1991 "Expected same number of operands");
1992 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
1993 }
1994 }
1995 }
1996
1997 /// Reorders operands of the node to the given mask \p Mask.
1998 void reorderOperands(ArrayRef<int> Mask) {
1999 for (ValueList &Operand : Operands)
2000 reorderScalars(Operand, Mask);
2001 }
2002
2003 /// \returns the \p OpIdx operand of this TreeEntry.
2004 ValueList &getOperand(unsigned OpIdx) {
2005 assert(OpIdx < Operands.size() && "Off bounds");
2006 return Operands[OpIdx];
2007 }
2008
2009 /// \returns the \p OpIdx operand of this TreeEntry.
2010 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2011 assert(OpIdx < Operands.size() && "Off bounds");
2012 return Operands[OpIdx];
2013 }
2014
2015 /// \returns the number of operands.
2016 unsigned getNumOperands() const { return Operands.size(); }
2017
2018 /// \return the single \p OpIdx operand.
2019 Value *getSingleOperand(unsigned OpIdx) const {
2020 assert(OpIdx < Operands.size() && "Off bounds");
2021 assert(!Operands[OpIdx].empty() && "No operand available");
2022 return Operands[OpIdx][0];
2023 }
2024
2025 /// Some of the instructions in the list have alternate opcodes.
2026 bool isAltShuffle() const { return MainOp != AltOp; }
2027
2028 bool isOpcodeOrAlt(Instruction *I) const {
2029 unsigned CheckedOpcode = I->getOpcode();
2030 return (getOpcode() == CheckedOpcode ||
2031 getAltOpcode() == CheckedOpcode);
2032 }
2033
2034 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2035 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2036 /// \p OpValue.
2037 Value *isOneOf(Value *Op) const {
2038 auto *I = dyn_cast<Instruction>(Op);
2039 if (I && isOpcodeOrAlt(I))
2040 return Op;
2041 return MainOp;
2042 }
2043
2044 void setOperations(const InstructionsState &S) {
2045 MainOp = S.MainOp;
2046 AltOp = S.AltOp;
2047 }
2048
2049 Instruction *getMainOp() const {
2050 return MainOp;
2051 }
2052
2053 Instruction *getAltOp() const {
2054 return AltOp;
2055 }
2056
2057 /// The main/alternate opcodes for the list of instructions.
2058 unsigned getOpcode() const {
2059 return MainOp ? MainOp->getOpcode() : 0;
2060 }
2061
2062 unsigned getAltOpcode() const {
2063 return AltOp ? AltOp->getOpcode() : 0;
2064 }
2065
2066 /// When ReuseReorderShuffleIndices is empty it just returns the position of
2067 /// \p V within the vector of Scalars. Otherwise, it tries to remap V via its reuse index.
2068 int findLaneForValue(Value *V) const {
2069 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2070 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2071 if (!ReorderIndices.empty())
2072 FoundLane = ReorderIndices[FoundLane];
2073 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2074 if (!ReuseShuffleIndices.empty()) {
2075 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2076 find(ReuseShuffleIndices, FoundLane));
2077 }
2078 return FoundLane;
2079 }
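
Editor's note: findLaneForValue() applies up to two remappings to the raw position of the scalar. The standalone sketch below uses toy data and a hypothetical helper name to walk through the same steps: locate the value, remap through the reorder indices, then find the slot that refers to that lane in the reuse mask.

#include <algorithm>
#include <cstdio>
#include <vector>

static int findLane(const std::vector<int> &Scalars, int V,
                    const std::vector<unsigned> &ReorderIndices,
                    const std::vector<int> &ReuseShuffleIndices) {
  // Raw position of V among the scalars.
  unsigned FoundLane = (unsigned)(std::find(Scalars.begin(), Scalars.end(), V) -
                                  Scalars.begin());
  // Remap position i to lane ReorderIndices[i], if reordering is active.
  if (!ReorderIndices.empty())
    FoundLane = ReorderIndices[FoundLane];
  // Find which slot of the reuse mask refers to that lane, if reuse is active.
  if (!ReuseShuffleIndices.empty())
    FoundLane = (unsigned)(std::find(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end(),
                                     (int)FoundLane) -
                           ReuseShuffleIndices.begin());
  return (int)FoundLane;
}

int main() {
  std::vector<int> Scalars = {10, 20, 30, 40};
  std::vector<unsigned> Reorder = {2, 3, 0, 1};
  std::vector<int> Reuse = {1, 0, 3, 2};
  // 20 sits at position 1, is remapped to lane 3, and lane 3 is referenced
  // by slot 2 of the reuse mask, so the result is 2.
  std::printf("lane of 20: %d\n", findLane(Scalars, 20, Reorder, Reuse));
  return 0;
}
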
2080
2081#ifndef NDEBUG
2082 /// Debug printer.
2083 LLVM_DUMP_METHOD void dump() const {
2084 dbgs() << Idx << ".\n";
2085 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2086 dbgs() << "Operand " << OpI << ":\n";
2087 for (const Value *V : Operands[OpI])
2088 dbgs().indent(2) << *V << "\n";
2089 }
2090 dbgs() << "Scalars: \n";
2091 for (Value *V : Scalars)
2092 dbgs().indent(2) << *V << "\n";
2093 dbgs() << "State: ";
2094 switch (State) {
2095 case Vectorize:
2096 dbgs() << "Vectorize\n";
2097 break;
2098 case ScatterVectorize:
2099 dbgs() << "ScatterVectorize\n";
2100 break;
2101 case NeedToGather:
2102 dbgs() << "NeedToGather\n";
2103 break;
2104 }
2105 dbgs() << "MainOp: ";
2106 if (MainOp)
2107 dbgs() << *MainOp << "\n";
2108 else
2109 dbgs() << "NULL\n";
2110 dbgs() << "AltOp: ";
2111 if (AltOp)
2112 dbgs() << *AltOp << "\n";
2113 else
2114 dbgs() << "NULL\n";
2115 dbgs() << "VectorizedValue: ";
2116 if (VectorizedValue)
2117 dbgs() << *VectorizedValue << "\n";
2118 else
2119 dbgs() << "NULL\n";
2120 dbgs() << "ReuseShuffleIndices: ";
2121 if (ReuseShuffleIndices.empty())
2122 dbgs() << "Empty";
2123 else
2124 for (int ReuseIdx : ReuseShuffleIndices)
2125 dbgs() << ReuseIdx << ", ";
2126 dbgs() << "\n";
2127 dbgs() << "ReorderIndices: ";
2128 for (unsigned ReorderIdx : ReorderIndices)
2129 dbgs() << ReorderIdx << ", ";
2130 dbgs() << "\n";
2131 dbgs() << "UserTreeIndices: ";
2132 for (const auto &EInfo : UserTreeIndices)
2133 dbgs() << EInfo << ", ";
2134 dbgs() << "\n";
2135 }
2136#endif
2137 };
2138
2139#ifndef NDEBUG
2140 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2141 InstructionCost VecCost,
2142 InstructionCost ScalarCost) const {
2143 dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
2144 dbgs() << "SLP: Costs:\n";
2145 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2146 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2147 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2148 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
2149 ReuseShuffleCost + VecCost - ScalarCost << "\n";
2150 }
2151#endif
2152
2153 /// Create a new VectorizableTree entry.
2154 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
2155 const InstructionsState &S,
2156 const EdgeInfo &UserTreeIdx,
2157 ArrayRef<int> ReuseShuffleIndices = None,
2158 ArrayRef<unsigned> ReorderIndices = None) {
2159 TreeEntry::EntryState EntryState =
2160 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2161 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2162 ReuseShuffleIndices, ReorderIndices);
2163 }
2164
2165 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2166 TreeEntry::EntryState EntryState,
2167 Optional<ScheduleData *> Bundle,
2168 const InstructionsState &S,
2169 const EdgeInfo &UserTreeIdx,
2170 ArrayRef<int> ReuseShuffleIndices = None,
2171 ArrayRef<unsigned> ReorderIndices = None) {
2172 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2173 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2174 "Need to vectorize gather entry?");
2175 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2176 TreeEntry *Last = VectorizableTree.back().get();
2177 Last->Idx = VectorizableTree.size() - 1;
2178 Last->State = EntryState;
2179 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2180 ReuseShuffleIndices.end());
2181 if (ReorderIndices.empty()) {
2182 Last->Scalars.assign(VL.begin(), VL.end());
2183 Last->setOperations(S);
2184 } else {
2185 // Reorder scalars and build final mask.
2186 Last->Scalars.assign(VL.size(), nullptr);
2187 transform(ReorderIndices, Last->Scalars.begin(),
2188 [VL](unsigned Idx) -> Value * {
2189 if (Idx >= VL.size())
2190 return UndefValue::get(VL.front()->getType());
2191 return VL[Idx];
2192 });
2193 InstructionsState S = getSameOpcode(Last->Scalars);
2194 Last->setOperations(S);
2195 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2196 }
2197 if (Last->State != TreeEntry::NeedToGather) {
2198 for (Value *V : VL) {
2199 assert(!getTreeEntry(V) && "Scalar already in tree!");
2200 ScalarToTreeEntry[V] = Last;
2201 }
2202 // Update the scheduler bundle to point to this TreeEntry.
2203 unsigned Lane = 0;
2204 for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
2205 BundleMember = BundleMember->NextInBundle) {
2206 BundleMember->TE = Last;
2207 BundleMember->Lane = Lane;
2208 ++Lane;
2209 }
2210 assert((!Bundle.getValue() || Lane == VL.size()) &&
2211 "Bundle and VL out of sync");
2212 } else {
2213 MustGather.insert(VL.begin(), VL.end());
2214 }
2215
2216 if (UserTreeIdx.UserTE)
2217 Last->UserTreeIndices.push_back(UserTreeIdx);
2218
2219 return Last;
2220 }
2221
2222 /// -- Vectorization State --
2223 /// Holds all of the tree entries.
2224 TreeEntry::VecTreeTy VectorizableTree;
2225
2226#ifndef NDEBUG
2227 /// Debug printer.
2228 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2229 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2230 VectorizableTree[Id]->dump();
2231 dbgs() << "\n";
2232 }
2233 }
2234#endif
2235
2236 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2237
2238 const TreeEntry *getTreeEntry(Value *V) const {
2239 return ScalarToTreeEntry.lookup(V);
2240 }
2241
2242 /// Maps a specific scalar to its tree entry.
2243 SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
2244
2245 /// Maps a value to the proposed vectorizable size.
2246 SmallDenseMap<Value *, unsigned> InstrElementSize;
2247
2248 /// A list of scalars that we found we need to keep as scalars.
2249 ValueSet MustGather;
2250
2251 /// This POD struct describes one external user in the vectorized tree.
2252 struct ExternalUser {
2253 ExternalUser(Value *S, llvm::User *U, int L)
2254 : Scalar(S), User(U), Lane(L) {}
2255
2256 // Which scalar in our function.
2257 Value *Scalar;
2258
2259 // Which user that uses the scalar.
2260 llvm::User *User;
2261
2262 // Which lane does the scalar belong to.
2263 int Lane;
2264 };
2265 using UserList = SmallVector<ExternalUser, 16>;
2266
2267 /// Checks if two instructions may access the same memory.
2268 ///
2269 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2270 /// is invariant in the calling loop.
2271 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2272 Instruction *Inst2) {
2273 // First check if the result is already in the cache.
2274 AliasCacheKey key = std::make_pair(Inst1, Inst2);
2275 Optional<bool> &result = AliasCache[key];
2276 if (result.hasValue()) {
2277 return result.getValue();
2278 }
2279 bool aliased = true;
2280 if (Loc1.Ptr && isSimple(Inst1))
2281 aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1));
2282 // Store the result in the cache.
2283 result = aliased;
2284 return aliased;
2285 }
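
Editor's note: isAliased() is a classic memoized pairwise query: the Optional-valued cache slot is created on first lookup and filled once the (expensive) alias query has run. The standalone sketch below shows the same pattern with the standard library (needs C++17 for std::optional; the names are illustrative, and expensiveOracle merely stands in for AA->getModRefInfo).

#include <cstdio>
#include <map>
#include <optional>
#include <utility>

using Key = std::pair<int, int>;

static bool expensiveOracle(int A, int B) {
  std::printf("  oracle(%d, %d) called\n", A, B);
  return (A + B) % 2 == 0;
}

// Run the oracle at most once per ordered pair; later queries hit the cache.
static bool cachedQuery(std::map<Key, std::optional<bool>> &Cache, int A,
                        int B) {
  std::optional<bool> &Result = Cache[{A, B}]; // creates an empty slot
  if (Result.has_value())
    return *Result;
  bool Answer = expensiveOracle(A, B);
  Result = Answer; // Result still references the slot inside the map
  return Answer;
}

int main() {
  std::map<Key, std::optional<bool>> Cache;
  cachedQuery(Cache, 1, 3); // runs the oracle
  cachedQuery(Cache, 1, 3); // answered from the cache, no oracle call
  return 0;
}
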
2286
2287 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
2288
2289 /// Cache for alias results.
2290 /// TODO: consider moving this to the AliasAnalysis itself.
2291 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
2292
2293 /// Removes an instruction from its block and eventually deletes it.
2294 /// It's like Instruction::eraseFromParent() except that the actual deletion
2295 /// is delayed until BoUpSLP is destructed.
2296 /// This is required to ensure that there are no incorrect collisions in the
2297 /// AliasCache, which can happen if a new instruction is allocated at the
2298 /// same address as a previously deleted instruction.
2299 void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
2300 auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
2301 It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
2302 }
2303
2304 /// Temporary store for deleted instructions. Instructions will be deleted
2305 /// eventually when the BoUpSLP is destructed.
2306 DenseMap<Instruction *, bool> DeletedInstructions;
2307
2308 /// A list of values that need to be extracted out of the tree.
2309 /// This list holds pairs of (Internal Scalar : External User). External User
2310 /// can be nullptr, it means that this Internal Scalar will be used later,
2311 /// after vectorization.
2312 UserList ExternalUses;
2313
2314 /// Values used only by @llvm.assume calls.
2315 SmallPtrSet<const Value *, 32> EphValues;
2316
2317 /// Holds all of the instructions that we gathered.
2318 SetVector<Instruction *> GatherShuffleSeq;
2319
2320 /// A list of blocks that we are going to CSE.
2321 SetVector<BasicBlock *> CSEBlocks;
2322
2323 /// Contains all scheduling relevant data for an instruction.
2324 /// A ScheduleData either represents a single instruction or a member of an
2325 /// instruction bundle (= a group of instructions which is combined into a
2326 /// vector instruction).
2327 struct ScheduleData {
2328 // The initial value for the dependency counters. It means that the
2329 // dependencies are not calculated yet.
2330 enum { InvalidDeps = -1 };
2331
2332 ScheduleData() = default;
2333
2334 void init(int BlockSchedulingRegionID, Value *OpVal) {
2335 FirstInBundle = this;
2336 NextInBundle = nullptr;
2337 NextLoadStore = nullptr;
2338 IsScheduled = false;
2339 SchedulingRegionID = BlockSchedulingRegionID;
2340 UnscheduledDepsInBundle = UnscheduledDeps;
2341 clearDependencies();
2342 OpValue = OpVal;
2343 TE = nullptr;
2344 Lane = -1;
2345 }
2346
2347 /// Returns true if the dependency information has been calculated.
2348 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
2349
2350 /// Returns true for single instructions and for bundle representatives
2351 /// (= the head of a bundle).
2352 bool isSchedulingEntity() const { return FirstInBundle == this; }
2353
2354 /// Returns true if it represents an instruction bundle and not only a
2355 /// single instruction.
2356 bool isPartOfBundle() const {
2357 return NextInBundle != nullptr || FirstInBundle != this;
2358 }
2359
2360 /// Returns true if it is ready for scheduling, i.e. it has no more
2361 /// unscheduled depending instructions/bundles.
2362 bool isReady() const {
2363 assert(isSchedulingEntity() &&
2364 "can't consider non-scheduling entity for ready list");
2365 return UnscheduledDepsInBundle == 0 && !IsScheduled;
2366 }
2367
2368 /// Modifies the number of unscheduled dependencies, also updating it for
2369 /// the whole bundle.
2370 int incrementUnscheduledDeps(int Incr) {
2371 UnscheduledDeps += Incr;
2372 return FirstInBundle->UnscheduledDepsInBundle += Incr;
2373 }
2374
2375 /// Sets the number of unscheduled dependencies to the number of
2376 /// dependencies.
2377 void resetUnscheduledDeps() {
2378 incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
2379 }
2380
2381 /// Clears all dependency information.
2382 void clearDependencies() {
2383 Dependencies = InvalidDeps;
2384 resetUnscheduledDeps();
2385 MemoryDependencies.clear();
2386 }
2387
2388 void dump(raw_ostream &os) const {
2389 if (!isSchedulingEntity()) {
2390 os << "/ " << *Inst;
2391 } else if (NextInBundle) {
2392 os << '[' << *Inst;
2393 ScheduleData *SD = NextInBundle;
2394 while (SD) {
2395 os << ';' << *SD->Inst;
2396 SD = SD->NextInBundle;
2397 }
2398 os << ']';
2399 } else {
2400 os << *Inst;
2401 }
2402 }
2403
2404 Instruction *Inst = nullptr;
2405
2406 /// Points to the head in an instruction bundle (and always to this for
2407 /// single instructions).
2408 ScheduleData *FirstInBundle = nullptr;
2409
2410 /// Single linked list of all instructions in a bundle. Null if it is a
2411 /// single instruction.
2412 ScheduleData *NextInBundle = nullptr;
2413
2414 /// Single linked list of all memory instructions (e.g. load, store, call)
2415 /// in the block - until the end of the scheduling region.
2416 ScheduleData *NextLoadStore = nullptr;
2417
2418 /// The dependent memory instructions.
2419 /// This list is derived on demand in calculateDependencies().
2420 SmallVector<ScheduleData *, 4> MemoryDependencies;
2421
2422 /// This ScheduleData is in the current scheduling region if this matches
2423 /// the current SchedulingRegionID of BlockScheduling.
2424 int SchedulingRegionID = 0;
2425
2426 /// Used for getting a "good" final ordering of instructions.
2427 int SchedulingPriority = 0;
2428
2429 /// The number of dependencies. Consists of the number of users of the
2430 /// instruction plus the number of dependent memory instructions (if any).
2431 /// This value is calculated on demand.
2432 /// If InvalidDeps, the number of dependencies is not calculated yet.
2433 int Dependencies = InvalidDeps;
2434
2435 /// The number of dependencies minus the number of dependencies of scheduled
2436 /// instructions. As soon as this is zero, the instruction/bundle gets ready
2437 /// for scheduling.
2438 /// Note that this is negative as long as Dependencies is not calculated.
2439 int UnscheduledDeps = InvalidDeps;
2440
2441 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
2442 /// single instructions.
2443 int UnscheduledDepsInBundle = InvalidDeps;
2444
2445 /// True if this instruction is scheduled (or considered as scheduled in the
2446 /// dry-run).
2447 bool IsScheduled = false;
2448
2449 /// Opcode of the current instruction in the schedule data.
2450 Value *OpValue = nullptr;
2451
2452 /// The TreeEntry that this instruction corresponds to.
2453 TreeEntry *TE = nullptr;
2454
2455 /// The lane of this node in the TreeEntry.
2456 int Lane = -1;
2457 };
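// Illustrative sketch (not part of the original SLPVectorizer.cpp source): how
// the bundle links above are expected to be wired for a hypothetical
// two-instruction bundle with ScheduleData objects `A` (head) and `B`:
//
//   A.FirstInBundle = &A;  A.NextInBundle = &B;
//   B.FirstInBundle = &A;  B.NextInBundle = nullptr;
//   // A.isSchedulingEntity() == true  -> only A can sit in the ready list.
//   // B.isSchedulingEntity() == false, but B.isPartOfBundle() == true.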
2458
2459#ifndef NDEBUG
2460 friend inline raw_ostream &operator<<(raw_ostream &os,
2461 const BoUpSLP::ScheduleData &SD) {
2462 SD.dump(os);
2463 return os;
2464 }
2465#endif
2466
2467 friend struct GraphTraits<BoUpSLP *>;
2468 friend struct DOTGraphTraits<BoUpSLP *>;
2469
2470 /// Contains all scheduling data for a basic block.
2471 struct BlockScheduling {
2472 BlockScheduling(BasicBlock *BB)
2473 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
2474
2475 void clear() {
2476 ReadyInsts.clear();
2477 ScheduleStart = nullptr;
2478 ScheduleEnd = nullptr;
2479 FirstLoadStoreInRegion = nullptr;
2480 LastLoadStoreInRegion = nullptr;
2481
2482 // Reduce the maximum schedule region size by the size of the
2483 // previous scheduling run.
2484 ScheduleRegionSizeLimit -= ScheduleRegionSize;
2485 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
2486 ScheduleRegionSizeLimit = MinScheduleRegionSize;
2487 ScheduleRegionSize = 0;
2488
2489 // Make a new scheduling region, i.e. all existing ScheduleData is not
2490 // in the new region yet.
2491 ++SchedulingRegionID;
2492 }
2493
2494 ScheduleData *getScheduleData(Value *V) {
2495 ScheduleData *SD = ScheduleDataMap[V];
2496 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
2497 return SD;
2498 return nullptr;
2499 }
2500
2501 ScheduleData *getScheduleData(Value *V, Value *Key) {
2502 if (V == Key)
2503 return getScheduleData(V);
2504 auto I = ExtraScheduleDataMap.find(V);
2505 if (I != ExtraScheduleDataMap.end()) {
2506 ScheduleData *SD = I->second[Key];
2507 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
2508 return SD;
2509 }
2510 return nullptr;
2511 }
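// Illustrative sketch (not part of the original SLPVectorizer.cpp source):
// both getScheduleData() overloads return nullptr when the value has no
// ScheduleData in the current scheduling region, so callers are expected to
// test the result before dereferencing it. Hypothetical caller, with `BS` a
// BlockScheduling instance and `V` some Value in the block:
//
//   if (ScheduleData *SD = BS.getScheduleData(V))
//     SD->SchedulingPriority = 0;   // safe: SD is known non-null here
//   // else: V is outside the current region, nothing to update.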
2512
2513 bool isInSchedulingRegion(ScheduleData *SD) const {
2514 return SD->SchedulingRegionID == SchedulingRegionID;
2515 }
2516
2517 /// Marks an instruction as scheduled and puts all dependent ready
2518 /// instructions into the ready-list.
2519 template <typename ReadyListType>
2520 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
2521 SD->IsScheduled = true;
15. Access to field 'IsScheduled' results in a dereference of a null pointer (loaded from variable 'SD')
2522 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
2523
2524 ScheduleData *BundleMember = SD;
2525 while (BundleMember) {
2526 if (BundleMember->Inst != BundleMember->OpValue) {
2527 BundleMember = BundleMember->NextInBundle;
2528 continue;
2529 }
2530 // Handle the def-use chain dependencies.
2531
2532 // Decrement the unscheduled counter and insert to ready list if ready.
2533 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
2534 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
2535 if (OpDef && OpDef->hasValidDependencies() &&
2536 OpDef->incrementUnscheduledDeps(-1) == 0) {
2537 // There are no more unscheduled dependencies after
2538 // decrementing, so we can put the dependent instruction
2539 // into the ready list.
2540 ScheduleData *DepBundle = OpDef->FirstInBundle;
2541 assert(!DepBundle->IsScheduled &&
2542 "already scheduled bundle gets ready");
2543 ReadyList.insert(DepBundle);
2544 LLVM_DEBUG(dbgs()
2545 << "SLP: gets ready (def): " << *DepBundle << "\n");
2546 }
2547 });
2548 };
2549
2550 // If BundleMember is a vector bundle, its operands may have been
2551 // reordered during buildTree(). We therefore need to get its operands
2552 // through the TreeEntry.
2553 if (TreeEntry *TE = BundleMember->TE) {
2554 int Lane = BundleMember->Lane;
2555 assert(Lane >= 0 && "Lane not set");
2556
2557 // Since vectorization tree is being built recursively this assertion
2558 // ensures that the tree entry has all operands set before reaching
2559 // this code. Couple of exceptions known at the moment are extracts
2560 // where their second (immediate) operand is not added. Since
2561 // immediates do not affect scheduler behavior this is considered
2562 // okay.
2563 auto *In = TE->getMainOp();
2564 assert(In &&
2565 (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
2566 In->getNumOperands() == TE->getNumOperands()) &&
2567 "Missed TreeEntry operands?");
2568 (void)In; // fake use to avoid build failure when assertions disabled
2569
2570 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
2571 OpIdx != NumOperands; ++OpIdx)
2572 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
2573 DecrUnsched(I);
2574 } else {
2575 // If BundleMember is a stand-alone instruction, no operand reordering
2576 // has taken place, so we directly access its operands.
2577 for (Use &U : BundleMember->Inst->operands())
2578 if (auto *I = dyn_cast<Instruction>(U.get()))
2579 DecrUnsched(I);
2580 }
2581 // Handle the memory dependencies.
2582 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
2583 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
2584 // There are no more unscheduled dependencies after decrementing,
2585 // so we can put the dependent instruction into the ready list.
2586 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
2587 assert(!DepBundle->IsScheduled &&
2588 "already scheduled bundle gets ready");
2589 ReadyList.insert(DepBundle);
2590 LLVM_DEBUG(dbgs()
2591 << "SLP: gets ready (mem): " << *DepBundle << "\n");
2592 }
2593 }
2594 BundleMember = BundleMember->NextInBundle;
2595 }
2596 }
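// Illustrative sketch (not part of the original SLPVectorizer.cpp source):
// schedule() dereferences SD unconditionally at line 2521, which is the point
// the analyzer flags. The report assumes SD can still be null when schedule()
// is entered, so a hypothetical defensive caller would guard the pointer
// before the call, roughly:
//
//   while (!ReadyInsts.empty()) {
//     ScheduleData *Picked = ReadyInsts.pop_back_val();
//     if (!Picked)                  // the check the analyzer considers missing
//       continue;
//     BS.schedule(Picked, ReadyInsts);
//   }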
2597
2598 void doForAllOpcodes(Value *V,
2599 function_ref<void(ScheduleData *SD)> Action) {
2600 if (ScheduleData *SD = getScheduleData(V))
2601 Action(SD);
2602 auto I = ExtraScheduleDataMap.find(V);
2603 if (I != ExtraScheduleDataMap.end())
2604 for (auto &P : I->second)
2605 if (P.second->SchedulingRegionID == SchedulingRegionID)
2606 Action(P.second);
2607 }
2608
2609 /// Put all instructions into the ReadyList which are ready for scheduling.
2610 template <typename ReadyListType>
2611 void initialFillReadyList(ReadyListType &ReadyList) {
2612 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2613 doForAllOpcodes(I, [&](ScheduleData *SD) {
2614 if (SD->isSchedulingEntity() && SD->isReady()) {
2615 ReadyList.insert(SD);
2616 LLVM_DEBUG(dbgs()
2617 << "SLP: initially in ready list: " << *I << "\n");
2618 }
2619 });
2620 }
2621 }
2622
2623 /// Checks if a bundle of instructions can be scheduled, i.e. has no
2624 /// cyclic dependencies. This is only a dry-run, no instructions are
2625 /// actually moved at this stage.
2626 /// \returns the scheduling bundle. The returned Optional value is non-None
2627 /// if \p VL is allowed to be scheduled.
2628 Optional<ScheduleData *>
2629 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
2630 const InstructionsState &S);
2631
2632 /// Un-bundles a group of instructions.
2633 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
2634
2635 /// Allocates schedule data chunk.
2636 ScheduleData *allocateScheduleDataChunks();
2637
2638 /// Extends the scheduling region so that V is inside the region.
2639 /// \returns true if the region size is within the limit.
2640 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
2641
2642 /// Initialize the ScheduleData structures for new instructions in the
2643 /// scheduling region.
2644 void initScheduleData(Instruction *FromI, Instruction *ToI,
2645 ScheduleData *PrevLoadStore,
2646 ScheduleData *NextLoadStore);
2647
2648 /// Updates the dependency information of a bundle and of all instructions/
2649 /// bundles which depend on the original bundle.
2650 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
2651 BoUpSLP *SLP);
2652
2653 /// Sets all instructions in the scheduling region to un-scheduled.
2654 void resetSchedule();
2655
2656 BasicBlock *BB;
2657
2658 /// Simple memory allocation for ScheduleData.
2659 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
2660
2661 /// The size of a ScheduleData array in ScheduleDataChunks.
2662 int ChunkSize;
2663
2664 /// The allocator position in the current chunk, which is the last entry
2665 /// of ScheduleDataChunks.
2666 int ChunkPos;
2667
2668 /// Attaches ScheduleData to Instruction.
2669 /// Note that the mapping survives during all vectorization iterations, i.e.
2670 /// ScheduleData structures are recycled.
2671 DenseMap<Value *, ScheduleData *> ScheduleDataMap;
2672
2673 /// Attaches ScheduleData to Instruction with the leading key.
2674 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
2675 ExtraScheduleDataMap;
2676
2677 struct ReadyList : SmallVector<ScheduleData *, 8> {
2678 void insert(ScheduleData *SD) { push_back(SD); }
2679 };
2680
2681 /// The ready-list for scheduling (only used for the dry-run).
2682 ReadyList ReadyInsts;
2683
2684 /// The first instruction of the scheduling region.
2685 Instruction *ScheduleStart = nullptr;
2686
2687 /// The first instruction _after_ the scheduling region.
2688 Instruction *ScheduleEnd = nullptr;
2689
2690 /// The first memory accessing instruction in the scheduling region
2691 /// (can be null).
2692 ScheduleData *FirstLoadStoreInRegion = nullptr;
2693
2694 /// The last memory accessing instruction in the scheduling region
2695 /// (can be null).
2696 ScheduleData *LastLoadStoreInRegion = nullptr;
2697
2698 /// The current size of the scheduling region.
2699 int ScheduleRegionSize = 0;
2700
2701 /// The maximum size allowed for the scheduling region.
2702 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
2703
2704 /// The ID of the scheduling region. For a new vectorization iteration this
2705 /// is incremented which "removes" all ScheduleData from the region.
2706 // Make sure that the initial SchedulingRegionID is greater than the
2707 // initial SchedulingRegionID in ScheduleData (which is 0).
2708 int SchedulingRegionID = 1;
2709 };
2710
2711 /// Attaches the BlockScheduling structures to basic blocks.
2712 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
2713
2714 /// Performs the "real" scheduling. Done before vectorization is actually
2715 /// performed in a basic block.
2716 void scheduleBlock(BlockScheduling *BS);
2717
2718 /// List of users to ignore during scheduling and that don't need extracting.
2719 ArrayRef<Value *> UserIgnoreList;
2720
2721 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
2722 /// sorted SmallVectors of unsigned.
2723 struct OrdersTypeDenseMapInfo {
2724 static OrdersType getEmptyKey() {
2725 OrdersType V;
2726 V.push_back(~1U);
2727 return V;
2728 }
2729
2730 static OrdersType getTombstoneKey() {
2731 OrdersType V;
2732 V.push_back(~2U);
2733 return V;
2734 }
2735
2736 static unsigned getHashValue(const OrdersType &V) {
2737 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
2738 }
2739
2740 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
2741 return LHS == RHS;
2742 }
2743 };
2744
2745 // Analysis and block reference.
2746 Function *F;
2747 ScalarEvolution *SE;
2748 TargetTransformInfo *TTI;
2749 TargetLibraryInfo *TLI;
2750 AAResults *AA;
2751 LoopInfo *LI;
2752 DominatorTree *DT;
2753 AssumptionCache *AC;
2754 DemandedBits *DB;
2755 const DataLayout *DL;
2756 OptimizationRemarkEmitter *ORE;
2757
2758 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
2759 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
2760
2761 /// Instruction builder to construct the vectorized tree.
2762 IRBuilder<> Builder;
2763
2764 /// A map of scalar integer values to the smallest bit width with which they
2765 /// can legally be represented. The values map to (width, signed) pairs,
2766 /// where "width" indicates the minimum bit width and "signed" is True if the
2767 /// value must be signed-extended, rather than zero-extended, back to its
2768 /// original width.
2769 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
2770};
2771
2772} // end namespace slpvectorizer
2773
2774template <> struct GraphTraits<BoUpSLP *> {
2775 using TreeEntry = BoUpSLP::TreeEntry;
2776
2777 /// NodeRef has to be a pointer per the GraphWriter.
2778 using NodeRef = TreeEntry *;
2779
2780 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
2781
2782 /// Add the VectorizableTree to the index iterator to be able to return
2783 /// TreeEntry pointers.
2784 struct ChildIteratorType
2785 : public iterator_adaptor_base<
2786 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
2787 ContainerTy &VectorizableTree;
2788
2789 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
2790 ContainerTy &VT)
2791 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
2792
2793 NodeRef operator*() { return I->UserTE; }
2794 };
2795
2796 static NodeRef getEntryNode(BoUpSLP &R) {
2797 return R.VectorizableTree[0].get();
2798 }
2799
2800 static ChildIteratorType child_begin(NodeRef N) {
2801 return {N->UserTreeIndices.begin(), N->Container};
2802 }
2803
2804 static ChildIteratorType child_end(NodeRef N) {
2805 return {N->UserTreeIndices.end(), N->Container};
2806 }
2807
2808 /// For the node iterator we just need to turn the TreeEntry iterator into a
2809 /// TreeEntry* iterator so that it dereferences to NodeRef.
2810 class nodes_iterator {
2811 using ItTy = ContainerTy::iterator;
2812 ItTy It;
2813
2814 public:
2815 nodes_iterator(const ItTy &It2) : It(It2) {}
2816 NodeRef operator*() { return It->get(); }
2817 nodes_iterator operator++() {
2818 ++It;
2819 return *this;
2820 }
2821 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
2822 };
2823
2824 static nodes_iterator nodes_begin(BoUpSLP *R) {
2825 return nodes_iterator(R->VectorizableTree.begin());
2826 }
2827
2828 static nodes_iterator nodes_end(BoUpSLP *R) {
2829 return nodes_iterator(R->VectorizableTree.end());
2830 }
2831
2832 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
2833};
2834
2835template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
2836 using TreeEntry = BoUpSLP::TreeEntry;
2837
2838 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
2839
2840 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
2841 std::string Str;
2842 raw_string_ostream OS(Str);
2843 if (isSplat(Entry->Scalars))
2844 OS << "<splat> ";
2845 for (auto V : Entry->Scalars) {
2846 OS << *V;
2847 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
2848 return EU.Scalar == V;
2849 }))
2850 OS << " <extract>";
2851 OS << "\n";
2852 }
2853 return Str;
2854 }
2855
2856 static std::string getNodeAttributes(const TreeEntry *Entry,
2857 const BoUpSLP *) {
2858 if (Entry->State == TreeEntry::NeedToGather)
2859 return "color=red";
2860 return "";
2861 }
2862};
2863
2864} // end namespace llvm
2865
2866BoUpSLP::~BoUpSLP() {
2867 for (const auto &Pair : DeletedInstructions) {
2868 // Replace operands of ignored instructions with Undefs in case they were
2869 // marked for deletion.
2870 if (Pair.getSecond()) {
2871 Value *Undef = UndefValue::get(Pair.getFirst()->getType());
2872 Pair.getFirst()->replaceAllUsesWith(Undef);
2873 }
2874 Pair.getFirst()->dropAllReferences();
2875 }
2876 for (const auto &Pair : DeletedInstructions) {
2877 assert(Pair.getFirst()->use_empty() &&
2878 "trying to erase instruction with users.");
2879 Pair.getFirst()->eraseFromParent();
2880 }
2881#ifdef EXPENSIVE_CHECKS
2882 // If we could guarantee that this call is not extremely slow, we could
2883 // remove the ifdef limitation (see PR47712).
2884 assert(!verifyFunction(*F, &dbgs()));
2885#endif
2886}
2887
2888void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
2889 for (auto *V : AV) {
2890 if (auto *I = dyn_cast<Instruction>(V))
2891 eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
2892 };
2893}
2894
2895/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
2896/// contains original mask for the scalars reused in the node. Procedure
2897/// transform this mask in accordance with the given \p Mask.
2898static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
2899 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
2900 "Expected non-empty mask.");
2901 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
2902 Prev.swap(Reuses);
2903 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
2904 if (Mask[I] != UndefMaskElem)
2905 Reuses[Mask[I]] = Prev[I];
2906}
2907
2908/// Reorders the given \p Order according to the given \p Mask. \p Order - is
2909/// the original order of the scalars. Procedure transforms the provided order
2910/// in accordance with the given \p Mask. If the resulting \p Order is just an
2911/// identity order, \p Order is cleared.
2912static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
2913 assert(!Mask.empty() && "Expected non-empty mask.");
2914 SmallVector<int> MaskOrder;
2915 if (Order.empty()) {
2916 MaskOrder.resize(Mask.size());
2917 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
2918 } else {
2919 inversePermutation(Order, MaskOrder);
2920 }
2921 reorderReuses(MaskOrder, Mask);
2922 if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
2923 Order.clear();
2924 return;
2925 }
2926 Order.assign(Mask.size(), Mask.size());
2927 for (unsigned I = 0, E = Mask.size(); I < E; ++I)
2928 if (MaskOrder[I] != UndefMaskElem)
2929 Order[MaskOrder[I]] = I;
2930 fixupOrderingIndices(Order);
2931}
2932
2933Optional<BoUpSLP::OrdersType>
2934BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
2935 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
2936 unsigned NumScalars = TE.Scalars.size();
2937 OrdersType CurrentOrder(NumScalars, NumScalars);
2938 SmallVector<int> Positions;
2939 SmallBitVector UsedPositions(NumScalars);
2940 const TreeEntry *STE = nullptr;
2941 // Try to find all gathered scalars that get vectorized in another
2942 // vectorized node. Here we can have only a single vectorized tree node to
2943 // correctly identify the order of the gathered scalars.
2944 for (unsigned I = 0; I < NumScalars; ++I) {
2945 Value *V = TE.Scalars[I];
2946 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
2947 continue;
2948 if (const auto *LocalSTE = getTreeEntry(V)) {
2949 if (!STE)
2950 STE = LocalSTE;
2951 else if (STE != LocalSTE)
2952 // Take the order only from the single vector node.
2953 return None;
2954 unsigned Lane =
2955 std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
2956 if (Lane >= NumScalars)
2957 return None;
2958 if (CurrentOrder[Lane] != NumScalars) {
2959 if (Lane != I)
2960 continue;
2961 UsedPositions.reset(CurrentOrder[Lane]);
2962 }
2963 // The partial identity (where only some elements of the gather node are
2964 // in the identity order) is good.
2965 CurrentOrder[Lane] = I;
2966 UsedPositions.set(I);
2967 }
2968 }
2969 // Need to keep the order if we have a vector entry and at least 2 scalars or
2970 // the vectorized entry has just 2 scalars.
2971 if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
2972 auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
2973 for (unsigned I = 0; I < NumScalars; ++I)
2974 if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
2975 return false;
2976 return true;
2977 };
2978 if (IsIdentityOrder(CurrentOrder)) {
2979 CurrentOrder.clear();
2980 return CurrentOrder;
2981 }
2982 auto *It = CurrentOrder.begin();
2983 for (unsigned I = 0; I < NumScalars;) {
2984 if (UsedPositions.test(I)) {
2985 ++I;
2986 continue;
2987 }
2988 if (*It == NumScalars) {
2989 *It = I;
2990 ++I;
2991 }
2992 ++It;
2993 }
2994 return CurrentOrder;
2995 }
2996 return None;
2997}
2998
2999Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
3000 bool TopToBottom) {
3001 // No need to reorder if we need to shuffle reuses; we still need to shuffle
3002 // the node.
3003 if (!TE.ReuseShuffleIndices.empty())
3004 return None;
3005 if (TE.State == TreeEntry::Vectorize &&
3006 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
3007 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
3008 !TE.isAltShuffle())
3009 return TE.ReorderIndices;
3010 if (TE.State == TreeEntry::NeedToGather) {
3011 // TODO: add analysis of other gather nodes with extractelement
3012 // instructions and other values/instructions, not only undefs.
3013 if (((TE.getOpcode() == Instruction::ExtractElement &&
3014 !TE.isAltShuffle()) ||
3015 (all_of(TE.Scalars,
3016 [](Value *V) {
3017 return isa<UndefValue, ExtractElementInst>(V);
3018 }) &&
3019 any_of(TE.Scalars,
3020 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
3021 all_of(TE.Scalars,
3022 [](Value *V) {
3023 auto *EE = dyn_cast<ExtractElementInst>(V);
3024 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
3025 }) &&
3026 allSameType(TE.Scalars)) {
3027 // Check that gather of extractelements can be represented as
3028 // just a shuffle of a single vector.
3029 OrdersType CurrentOrder;
3030 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
3031 if (Reuse || !CurrentOrder.empty()) {
3032 if (!CurrentOrder.empty())
3033 fixupOrderingIndices(CurrentOrder);
3034 return CurrentOrder;
3035 }
3036 }
3037 if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
3038 return CurrentOrder;
3039 }
3040 return None;
3041}
3042
3043void BoUpSLP::reorderTopToBottom() {
3044 // Maps VF to the graph nodes.
3045 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
3046 // ExtractElement gather nodes which can be vectorized and need to handle
3047 // their ordering.
3048 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3049 // Find all reorderable nodes with the given VF.
3050 // Currently these are vectorized stores, loads and extracts, plus some
3051 // gathering of extracts.
3052 for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
3053 const std::unique_ptr<TreeEntry> &TE) {
3054 if (Optional<OrdersType> CurrentOrder =
3055 getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
3056 // Do not include ordering for nodes used in the alt opcode vectorization,
3057 // it is better to reorder them during the bottom-to-top stage. If we follow
3058 // the order here, it causes reordering of the whole graph, though actually it
3059 // is profitable just to reorder the subgraph that starts at the alternate
3060 // opcode vectorization node. Such nodes already end up with a shuffle
3061 // instruction and it is enough to change this shuffle rather than
3062 // rotate the scalars for the whole graph.
3063 unsigned Cnt = 0;
3064 const TreeEntry *UserTE = TE.get();
3065 while (UserTE && Cnt < RecursionMaxDepth) {
3066 if (UserTE->UserTreeIndices.size() != 1)
3067 break;
3068 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
3069 return EI.UserTE->State == TreeEntry::Vectorize &&
3070 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
3071 }))
3072 return;
3073 if (UserTE->UserTreeIndices.empty())
3074 UserTE = nullptr;
3075 else
3076 UserTE = UserTE->UserTreeIndices.back().UserTE;
3077 ++Cnt;
3078 }
3079 VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
3080 if (TE->State != TreeEntry::Vectorize)
3081 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
3082 }
3083 });
3084
3085 // Reorder the graph nodes according to their vectorization factor.
3086 for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
3087 VF /= 2) {
3088 auto It = VFToOrderedEntries.find(VF);
3089 if (It == VFToOrderedEntries.end())
3090 continue;
3091 // Try to find the most profitable order. We are just looking for the most
3092 // used order and reorder the scalar elements in the nodes according to this
3093 // most used order.
3094 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
3095 // All operands are reordered and used only in this node - propagate the
3096 // most used order to the user node.
3097 MapVector<OrdersType, unsigned,
3098 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
3099 OrdersUses;
3100 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
3101 for (const TreeEntry *OpTE : OrderedEntries) {
3102 // No need to reorder this node; we still need to extend it and use a shuffle,
3103 // just merging the reordering shuffle and the reuse shuffle.
3104 if (!OpTE->ReuseShuffleIndices.empty())
3105 continue;
3106 // Count number of orders uses.
3107 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
3108 if (OpTE->State == TreeEntry::NeedToGather)
3109 return GathersToOrders.find(OpTE)->second;
3110 return OpTE->ReorderIndices;
3111 }();
3112 // Stores actually store the mask, not the order, need to invert.
3113 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
3114 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
3115 SmallVector<int> Mask;
3116 inversePermutation(Order, Mask);
3117 unsigned E = Order.size();
3118 OrdersType CurrentOrder(E, E);
3119 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
3120 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
3121 });
3122 fixupOrderingIndices(CurrentOrder);
3123 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
3124 } else {
3125 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
3126 }
3127 }
3128 // Set order of the user node.
3129 if (OrdersUses.empty())
3130 continue;
3131 // Choose the most used order.
3132 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
3133 unsigned Cnt = OrdersUses.front().second;
3134 for (const auto &Pair : drop_begin(OrdersUses)) {
3135 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
3136 BestOrder = Pair.first;
3137 Cnt = Pair.second;
3138 }
3139 }
3140 // Set order of the user node.
3141 if (BestOrder.empty())
3142 continue;
3143 SmallVector<int> Mask;
3144 inversePermutation(BestOrder, Mask);
3145 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
3146 unsigned E = BestOrder.size();
3147 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
3148 return I < E ? static_cast<int>(I) : UndefMaskElem;
3149 });
3150 // Do an actual reordering, if profitable.
3151 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
3152 // Just do the reordering for the nodes with the given VF.
3153 if (TE->Scalars.size() != VF) {
3154 if (TE->ReuseShuffleIndices.size() == VF) {
3155 // Need to reorder the reuses masks of the operands with smaller VF to
3156 // be able to find the match between the graph nodes and scalar
3157 // operands of the given node during vectorization/cost estimation.
3158 assert(all_of(TE->UserTreeIndices,
3159 [VF, &TE](const EdgeInfo &EI) {
3160 return EI.UserTE->Scalars.size() == VF ||
3161 EI.UserTE->Scalars.size() ==
3162 TE->Scalars.size();
3163 }) &&
3164 "All users must be of VF size.");
3165 // Update ordering of the operands with the smaller VF than the given
3166 // one.
3167 reorderReuses(TE->ReuseShuffleIndices, Mask);
3168 }
3169 continue;
3170 }
3171 if (TE->State == TreeEntry::Vectorize &&
3172 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
3173 InsertElementInst>(TE->getMainOp()) &&
3174 !TE->isAltShuffle()) {
3175 // Build correct orders for extract{element,value}, loads and
3176 // stores.
3177 reorderOrder(TE->ReorderIndices, Mask);
3178 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
3179 TE->reorderOperands(Mask);
3180 } else {
3181 // Reorder the node and its operands.
3182 TE->reorderOperands(Mask);
3183 assert(TE->ReorderIndices.empty() &&
3184 "Expected empty reorder sequence.");
3185 reorderScalars(TE->Scalars, Mask);
3186 }
3187 if (!TE->ReuseShuffleIndices.empty()) {
3188 // Apply reversed order to keep the original ordering of the reused
3189 // elements to avoid extra reorder indices shuffling.
3190 OrdersType CurrentOrder;
3191 reorderOrder(CurrentOrder, MaskOrder);
3192 SmallVector<int> NewReuses;
3193 inversePermutation(CurrentOrder, NewReuses);
3194 addMask(NewReuses, TE->ReuseShuffleIndices);
3195 TE->ReuseShuffleIndices.swap(NewReuses);
3196 }
3197 }
3198 }
3199}
3200
3201void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
3202 SetVector<TreeEntry *> OrderedEntries;
3203 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3204 // Find all reorderable leaf nodes with the given VF.
3205 // Currently these are vectorized loads and extracts without alternate
3206 // operands, plus some gathering of extracts.
3207 SmallVector<TreeEntry *> NonVectorized;
3208 for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
3209 &NonVectorized](
3210 const std::unique_ptr<TreeEntry> &TE) {
3211 if (TE->State != TreeEntry::Vectorize)
3212 NonVectorized.push_back(TE.get());
3213 if (Optional<OrdersType> CurrentOrder =
3214 getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
3215 OrderedEntries.insert(TE.get());
3216 if (TE->State != TreeEntry::Vectorize)
3217 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
3218 }
3219 });
3220
3221 // Checks if the operands of the users are reorderable and have only a
3222 // single use.
3223 auto &&CheckOperands =
3224 [this, &NonVectorized](const auto &Data,
3225 SmallVectorImpl<TreeEntry *> &GatherOps) {
3226 for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
3227 if (any_of(Data.second,
3228 [I](const std::pair<unsigned, TreeEntry *> &OpData) {
3229 return OpData.first == I &&
3230 OpData.second->State == TreeEntry::Vectorize;
3231 }))
3232 continue;
3233 ArrayRef<Value *> VL = Data.first->getOperand(I);
3234 const TreeEntry *TE = nullptr;
3235 const auto *It = find_if(VL, [this, &TE](Value *V) {
3236 TE = getTreeEntry(V);
3237 return TE;
3238 });
3239 if (It != VL.end() && TE->isSame(VL))
3240 return false;
3241 TreeEntry *Gather = nullptr;
3242 if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
3243 assert(TE->State != TreeEntry::Vectorize &&
3244 "Only non-vectorized nodes are expected.");
3245 if (TE->isSame(VL)) {
3246 Gather = TE;
3247 return true;
3248 }
3249 return false;
3250 }) > 1)
3251 return false;
3252 if (Gather)
3253 GatherOps.push_back(Gather);
3254 }
3255 return true;
3256 };
3257 // 1. Propagate order to the graph nodes, which use only reordered nodes.
3258 // I.e., if the node has operands that are reordered, try to put at least
3259 // one operand in the natural order and reorder the others, then reorder the
3260 // user node itself.
3261 SmallPtrSet<const TreeEntry *, 4> Visited;
3262 while (!OrderedEntries.empty()) {
3263 // 1. Filter out only reordered nodes.
3264 // 2. If the entry has multiple uses - skip it and jump to the next node.
3265 MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
3266 SmallVector<TreeEntry *> Filtered;
3267 for (TreeEntry *TE : OrderedEntries) {
3268 if (!(TE->State == TreeEntry::Vectorize ||
3269 (TE->State == TreeEntry::NeedToGather &&
3270 GathersToOrders.count(TE))) ||
3271 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
3272 !all_of(drop_begin(TE->UserTreeIndices),
3273 [TE](const EdgeInfo &EI) {
3274 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
3275 }) ||
3276 !Visited.insert(TE).second) {
3277 Filtered.push_back(TE);
3278 continue;
3279 }
3280 // Build a map between user nodes and their operand order to speed up the
3281 // search. The graph currently does not provide this dependency directly.
3282 for (EdgeInfo &EI : TE->UserTreeIndices) {
3283 TreeEntry *UserTE = EI.UserTE;
3284 auto It = Users.find(UserTE);
3285 if (It == Users.end())
3286 It = Users.insert({UserTE, {}}).first;
3287 It->second.emplace_back(EI.EdgeIdx, TE);
3288 }
3289 }
3290 // Erase filtered entries.
3291 for_each(Filtered,
3292 [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
3293 for (const auto &Data : Users) {
3294 // Check that operands are used only in the User node.
3295 SmallVector<TreeEntry *> GatherOps;
3296 if (!CheckOperands(Data, GatherOps)) {
3297 for_each(Data.second,
3298 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
3299 OrderedEntries.remove(Op.second);
3300 });
3301 continue;
3302 }
3303 // All operands are reordered and used only in this node - propagate the
3304 // most used order to the user node.
3305 MapVector<OrdersType, unsigned,
3306 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
3307 OrdersUses;
3308 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
3309 for (const auto &Op : Data.second) {
3310 TreeEntry *OpTE = Op.second;
3311 if (!OpTE->ReuseShuffleIndices.empty() ||
3312 (IgnoreReorder && OpTE == VectorizableTree.front().get()))
3313 continue;
3314 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
3315 if (OpTE->State == TreeEntry::NeedToGather)
3316 return GathersToOrders.find(OpTE)->second;
3317 return OpTE->ReorderIndices;
3318 }();
3319 // Stores actually store the mask, not the order, need to invert.
3320 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
3321 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
3322 SmallVector<int> Mask;
3323 inversePermutation(Order, Mask);
3324 unsigned E = Order.size();
3325 OrdersType CurrentOrder(E, E);
3326 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
3327 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
3328 });
3329 fixupOrderingIndices(CurrentOrder);
3330 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
3331 } else {
3332 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
3333 }
3334 if (VisitedOps.insert(OpTE).second)
3335 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
3336 OpTE->UserTreeIndices.size();
3337 assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
3338 --OrdersUses[{}];
3339 }
3340 // If no orders - skip current nodes and jump to the next one, if any.
3341 if (OrdersUses.empty()) {
3342 for_each(Data.second,
3343 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
3344 OrderedEntries.remove(Op.second);
3345 });
3346 continue;
3347 }
3348 // Choose the best order.
3349 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
3350 unsigned Cnt = OrdersUses.front().second;
3351 for (const auto &Pair : drop_begin(OrdersUses)) {
3352 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
3353 BestOrder = Pair.first;
3354 Cnt = Pair.second;
3355 }
3356 }
3357 // Set order of the user node (reordering of operands and user nodes).
3358 if (BestOrder.empty()) {
3359 for_each(Data.second,
3360 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
3361 OrderedEntries.remove(Op.second);
3362 });
3363 continue;
3364 }
3365 // Erase operands from OrderedEntries list and adjust their orders.
3366 VisitedOps.clear();
3367 SmallVector<int> Mask;
3368 inversePermutation(BestOrder, Mask);
3369 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
3370 unsigned E = BestOrder.size();
3371 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
3372 return I < E ? static_cast<int>(I) : UndefMaskElem;
3373 });
3374 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
3375 TreeEntry *TE = Op.second;
3376 OrderedEntries.remove(TE);
3377 if (!VisitedOps.insert(TE).second)
3378 continue;
3379 if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
3380 // Just reorder reuses indices.
3381 reorderReuses(TE->ReuseShuffleIndices, Mask);
3382 continue;
3383 }
3384 // Gathers are processed separately.
3385 if (TE->State != TreeEntry::Vectorize)
3386 continue;
3387 assert((BestOrder.size() == TE->ReorderIndices.size() ||
3388 TE->ReorderIndices.empty()) &&
3389 "Non-matching sizes of user/operand entries.");
3390 reorderOrder(TE->ReorderIndices, Mask);
3391 }
3392 // For gathers just need to reorder its scalars.
3393 for (TreeEntry *Gather : GatherOps) {
3394 assert(Gather->ReorderIndices.empty() &&
3395 "Unexpected reordering of gathers.");
3396 if (!Gather->ReuseShuffleIndices.empty()) {
3397 // Just reorder reuses indices.
3398 reorderReuses(Gather->ReuseShuffleIndices, Mask);
3399 continue;
3400 }
3401 reorderScalars(Gather->Scalars, Mask);
3402 OrderedEntries.remove(Gather);
3403 }
3404 // Reorder operands of the user node and set the ordering for the user
3405 // node itself.
3406 if (Data.first->State != TreeEntry::Vectorize ||
3407 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
3408 Data.first->getMainOp()) ||
3409 Data.first->isAltShuffle())
3410 Data.first->reorderOperands(Mask);
3411 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
3412 Data.first->isAltShuffle()) {
3413 reorderScalars(Data.first->Scalars, Mask);
3414 reorderOrder(Data.first->ReorderIndices, MaskOrder);
3415 if (Data.first->ReuseShuffleIndices.empty() &&
3416 !Data.first->ReorderIndices.empty() &&
3417 !Data.first->isAltShuffle()) {
3418 // Insert user node to the list to try to sink reordering deeper in
3419 // the graph.
3420 OrderedEntries.insert(Data.first);
3421 }
3422 } else {
3423 reorderOrder(Data.first->ReorderIndices, Mask);
3424 }
3425 }
3426 }
3427 // If the reordering is unnecessary, just remove the reorder.
3428 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
3429 VectorizableTree.front()->ReuseShuffleIndices.empty())
3430 VectorizableTree.front()->ReorderIndices.clear();
3431}
3432
3433void BoUpSLP::buildExternalUses(
3434 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
3435 // Collect the values that we need to extract from the tree.
3436 for (auto &TEPtr : VectorizableTree) {
3437 TreeEntry *Entry = TEPtr.get();
3438
3439 // No need to handle users of gathered values.
3440 if (Entry->State == TreeEntry::NeedToGather)
3441 continue;
3442
3443 // For each lane:
3444 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
3445 Value *Scalar = Entry->Scalars[Lane];
3446 int FoundLane = Entry->findLaneForValue(Scalar);
3447
3448 // Check if the scalar is externally used as an extra arg.
3449 auto ExtI = ExternallyUsedValues.find(Scalar);
3450 if (ExtI != ExternallyUsedValues.end()) {
3451 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
3452 << Lane << " from " << *Scalar << ".\n");
3453 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
3454 }
3455 for (User *U : Scalar->users()) {
3456 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
3457
3458 Instruction *UserInst = dyn_cast<Instruction>(U);
3459 if (!UserInst)
3460 continue;
3461
3462 if (isDeleted(UserInst))
3463 continue;
3464
3465 // Skip in-tree scalars that become vectors
3466 if (TreeEntry *UseEntry = getTreeEntry(U)) {
3467 Value *UseScalar = UseEntry->Scalars[0];
3468 // Some in-tree scalars will remain as scalar in vectorized
3469 // instructions. If that is the case, the one in Lane 0 will
3470 // be used.
3471 if (UseScalar != U ||
3472 UseEntry->State == TreeEntry::ScatterVectorize ||
3473 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
3474 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
3475 << ".\n");
3476 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
3477 continue;
3478 }
3479 }
3480
3481 // Ignore users in the user ignore list.
3482 if (is_contained(UserIgnoreList, UserInst))
3483 continue;
3484
3485 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
3486 << Lane << " from " << *Scalar << ".\n");
3487 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
3488 }
3489 }
3490 }
3491}
3492
3493void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
3494 ArrayRef<Value *> UserIgnoreLst) {
3495 deleteTree();
3496 UserIgnoreList = UserIgnoreLst;
3497 if (!allSameType(Roots))
3498 return;
3499 buildTree_rec(Roots, 0, EdgeInfo());
3500}
3501
3502namespace {
3503/// Tracks the state we can represent the loads in the given sequence.
3504enum class LoadsState { Gather, Vectorize, ScatterVectorize };
3505} // anonymous namespace
3506
3507/// Checks if the given array of loads can be represented as a vectorized,
3508/// scatter or just simple gather.
3509static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
3510 const TargetTransformInfo &TTI,
3511 const DataLayout &DL, ScalarEvolution &SE,
3512 SmallVectorImpl<unsigned> &Order,
3513 SmallVectorImpl<Value *> &PointerOps) {
3514 // Check that a vectorized load would load the same memory as a scalar
3515 // load. For example, we don't want to vectorize loads that are smaller
3516 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3517 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3518 // from such a struct, we read/write packed bits disagreeing with the
3519 // unvectorized version.
3520 Type *ScalarTy = VL0->getType();
3521
3522 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
3523 return LoadsState::Gather;
3524
3525 // Make sure all loads in the bundle are simple - we can't vectorize
3526 // atomic or volatile loads.
3527 PointerOps.clear();
3528 PointerOps.resize(VL.size());
3529 auto *POIter = PointerOps.begin();
3530 for (Value *V : VL) {
3531 auto *L = cast<LoadInst>(V);
3532 if (!L->isSimple())
3533 return LoadsState::Gather;
3534 *POIter = L->getPointerOperand();
3535 ++POIter;
3536 }
3537
3538 Order.clear();
3539 // Check the order of pointer operands.
3540 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) {
3541 Value *Ptr0;
3542 Value *PtrN;
3543 if (Order.empty()) {
3544 Ptr0 = PointerOps.front();
3545 PtrN = PointerOps.back();
3546 } else {
3547 Ptr0 = PointerOps[Order.front()];
3548 PtrN = PointerOps[Order.back()];
3549 }
3550 Optional<int> Diff =
3551 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
3552 // Check that the sorted loads are consecutive.
3553 if (static_cast<unsigned>(*Diff) == VL.size() - 1)
3554 return LoadsState::Vectorize;
3555 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3556 for (Value *V : VL)
3557 CommonAlignment =
3558 commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
3559 if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
3560 CommonAlignment))
3561 return LoadsState::ScatterVectorize;
3562 }
3563
3564 return LoadsState::Gather;
3565}
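// --- Editorial sketch (not part of SLPVectorizer.cpp) -----------------------
// The `*Diff == VL.size() - 1` test above reduces to: once the sorted pointers
// are expressed in element units, the distance from the first to the last
// pointer must cover exactly one element per lane. A minimal, self-contained
// illustration of that criterion (the function name is hypothetical):
#include <cstdint>
#include <vector>

static bool areSortedElementOffsetsConsecutive(
    const std::vector<int64_t> &Offsets) {
  if (Offsets.empty())
    return false;
  // Mirrors the check in canVectorizeLoads: last - first == lane count - 1.
  return Offsets.back() - Offsets.front() ==
         static_cast<int64_t>(Offsets.size()) - 1;
}
// ----------------------------------------------------------------------------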
3566
3567void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
3568 const EdgeInfo &UserTreeIdx) {
3569 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
3570
3571 SmallVector<int> ReuseShuffleIndicies;
3572 SmallVector<Value *> UniqueValues;
3573 auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
3574 &UserTreeIdx,
3575 this](const InstructionsState &S) {
3576 // Check that every instruction appears once in this bundle.
3577 DenseMap<Value *, unsigned> UniquePositions;
3578 for (Value *V : VL) {
3579 if (isConstant(V)) {
3580 ReuseShuffleIndicies.emplace_back(
3581 isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
3582 UniqueValues.emplace_back(V);
3583 continue;
3584 }
3585 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
3586 ReuseShuffleIndicies.emplace_back(Res.first->second);
3587 if (Res.second)
3588 UniqueValues.emplace_back(V);
3589 }
3590 size_t NumUniqueScalarValues = UniqueValues.size();
3591 if (NumUniqueScalarValues == VL.size()) {
3592 ReuseShuffleIndicies.clear();
3593 } else {
3594 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
3595 if (NumUniqueScalarValues <= 1 ||
3596 (UniquePositions.size() == 1 && all_of(UniqueValues,
3597 [](Value *V) {
3598 return isa<UndefValue>(V) ||
3599 !isConstant(V);
3600 })) ||
3601 !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
3602 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
3603 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3604 return false;
3605 }
3606 VL = UniqueValues;
3607 }
3608 return true;
3609 };
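// --- Editorial sketch (not part of SLPVectorizer.cpp) -----------------------
// TryToFindDuplicates above builds a "reuse mask": every lane of the original
// bundle is mapped to the position of its first occurrence in the deduplicated
// scalar list. A simplified standalone version of that mapping (the names and
// the use of void* are hypothetical stand-ins):
#include <unordered_map>
#include <vector>

static std::vector<int> buildReuseMask(const std::vector<void *> &Bundle,
                                       std::vector<void *> &UniqueOut) {
  std::unordered_map<void *, int> FirstPos;
  std::vector<int> Mask;
  Mask.reserve(Bundle.size());
  for (void *V : Bundle) {
    auto Res = FirstPos.try_emplace(V, static_cast<int>(UniqueOut.size()));
    if (Res.second)
      UniqueOut.push_back(V);          // first time this scalar is seen
    Mask.push_back(Res.first->second); // lane reuses the first occurrence
  }
  return Mask; // e.g. {a, b, a, b} -> Unique = {a, b}, Mask = {0, 1, 0, 1}
}
// ----------------------------------------------------------------------------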
3610
3611 InstructionsState S = getSameOpcode(VL);
3612 if (Depth == RecursionMaxDepth) {
3613 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
3614 if (TryToFindDuplicates(S))
3615 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3616 ReuseShuffleIndicies);
3617 return;
3618 }
3619
3620 // Don't handle scalable vectors
3621 if (S.getOpcode() == Instruction::ExtractElement &&
3622 isa<ScalableVectorType>(
3623 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
3624 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
3625 if (TryToFindDuplicates(S))
3626 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3627 ReuseShuffleIndicies);
3628 return;
3629 }
3630
3631 // Don't handle vectors.
3632 if (S.OpValue->getType()->isVectorTy() &&
3633 !isa<InsertElementInst>(S.OpValue)) {
3634 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
3635 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3636 return;
3637 }
3638
3639 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
3640 if (SI->getValueOperand()->getType()->isVectorTy()) {
3641 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
3642 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3643 return;
3644 }
3645
3646 // If all of the operands are identical or constant we have a simple solution.
3647 // If we deal with insert/extract instructions, they all must have constant
3648 // indices, otherwise we should gather them, not try to vectorize.
3649 if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
3650 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
3651 !all_of(VL, isVectorLikeInstWithConstOps))) {
3652 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
3653 if (TryToFindDuplicates(S))
3654 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3655 ReuseShuffleIndicies);
3656 return;
3657 }
3658
3659 // We now know that this is a vector of instructions of the same type from
3660 // the same block.
3661
3662 // Don't vectorize ephemeral values.
3663 for (Value *V : VL) {
3664 if (EphValues.count(V)) {
3665 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
3666 << ") is ephemeral.\n");
3667 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3668 return;
3669 }
3670 }
3671
3672 // Check if this is a duplicate of another entry.
3673 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
3674 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
3675 if (!E->isSame(VL)) {
3676 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
3677 if (TryToFindDuplicates(S))
3678 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3679 ReuseShuffleIndicies);
3680 return;
3681 }
3682 // Record the reuse of the tree node. FIXME, currently this is only used to
3683 // properly draw the graph rather than for the actual vectorization.
3684 E->UserTreeIndices.push_back(UserTreeIdx);
3685 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
3686 << ".\n");
3687 return;
3688 }
3689
3690 // Check that none of the instructions in the bundle are already in the tree.
3691 for (Value *V : VL) {
3692 auto *I = dyn_cast<Instruction>(V);
3693 if (!I)
3694 continue;
3695 if (getTreeEntry(I)) {
3696 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
3697 << ") is already in tree.\n");
3698 if (TryToFindDuplicates(S))
3699 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3700 ReuseShuffleIndicies);
3701 return;
3702 }
3703 }
3704
3705 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
3706 for (Value *V : VL) {
3707 if (is_contained(UserIgnoreList, V)) {
3708 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
3709 if (TryToFindDuplicates(S))
3710 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3711 ReuseShuffleIndicies);
3712 return;
3713 }
3714 }
3715
3716 // Check that all of the users of the scalars that we want to vectorize are
3717 // schedulable.
3718 auto *VL0 = cast<Instruction>(S.OpValue);
3719 BasicBlock *BB = VL0->getParent();
3720
3721 if (!DT->isReachableFromEntry(BB)) {
3722 // Don't go into unreachable blocks. They may contain instructions with
3723 // dependency cycles which confuse the final scheduling.
3724 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
3725 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3726 return;
3727 }
3728
3729 // Check that every instruction appears once in this bundle.
3730 if (!TryToFindDuplicates(S))
3731 return;
3732
3733 auto &BSRef = BlocksSchedules[BB];
3734 if (!BSRef)
3735 BSRef = std::make_unique<BlockScheduling>(BB);
3736
3737 BlockScheduling &BS = *BSRef.get();
3738
3739 Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
3740 if (!Bundle) {
3741 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
3742 assert((!BS.getScheduleData(VL0) ||
3743 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
3744 "tryScheduleBundle should cancelScheduling on failure");
3745 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3746 ReuseShuffleIndicies);
3747 return;
3748 }
3749 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
3750
3751 unsigned ShuffleOrOp = S.isAltShuffle() ?
3752 (unsigned) Instruction::ShuffleVector : S.getOpcode();
3753 switch (ShuffleOrOp) {
3754 case Instruction::PHI: {
3755 auto *PH = cast<PHINode>(VL0);
3756
3757 // Check for terminator values (e.g. invoke).
3758 for (Value *V : VL)
3759 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
3760 Instruction *Term = dyn_cast<Instruction>(
3761 cast<PHINode>(V)->getIncomingValueForBlock(
3762 PH->getIncomingBlock(I)));
3763 if (Term && Term->isTerminator()) {
3764 LLVM_DEBUG(dbgs()
3765 << "SLP: Need to swizzle PHINodes (terminator use).\n");
3766 BS.cancelScheduling(VL, VL0);
3767 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3768 ReuseShuffleIndicies);
3769 return;
3770 }
3771 }
3772
3773 TreeEntry *TE =
3774 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
3775 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
3776
3777 // Keeps the reordered operands to avoid code duplication.
3778 SmallVector<ValueList, 2> OperandsVec;
3779 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
3780 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
3781 ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
3782 TE->setOperand(I, Operands);
3783 OperandsVec.push_back(Operands);
3784 continue;
3785 }
3786 ValueList Operands;
3787 // Prepare the operand vector.
3788 for (Value *V : VL)
3789 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
3790 PH->getIncomingBlock(I)));
3791 TE->setOperand(I, Operands);
3792 OperandsVec.push_back(Operands);
3793 }
3794 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
3795 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
3796 return;
3797 }
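// --- Editorial sketch (not part of SLPVectorizer.cpp) -----------------------
// The PHI case above regroups a bundle of PHI nodes "by incoming block" so
// that each per-block operand list can be recursed on with buildTree_rec. A
// simplified, self-contained picture of that regrouping (string block names
// and int incoming values are hypothetical stand-ins for real IR objects):
#include <map>
#include <string>
#include <vector>

using SimplePhi = std::map<std::string, int>; // incoming block -> value

static std::map<std::string, std::vector<int>>
groupOperandsByIncomingBlock(const std::vector<SimplePhi> &Bundle) {
  std::map<std::string, std::vector<int>> Operands;
  for (const SimplePhi &Phi : Bundle)
    for (const auto &Incoming : Phi)
      // One operand vector per incoming block, one element per PHI in the
      // bundle (assumes all PHIs in the bundle list the same blocks).
      Operands[Incoming.first].push_back(Incoming.second);
  return Operands;
}
// ----------------------------------------------------------------------------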
3798 case Instruction::ExtractValue:
3799 case Instruction::ExtractElement: {
3800 OrdersType CurrentOrder;
3801 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
3802 if (Reuse) {
3803 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
3804 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3805 ReuseShuffleIndicies);
3806 // This is a special case, as it does not gather, but at the same time
3807 // we are not extending buildTree_rec() towards the operands.
3808 ValueList Op0;
3809 Op0.assign(VL.size(), VL0->getOperand(0));
3810 VectorizableTree.back()->setOperand(0, Op0);
3811 return;
3812 }
3813 if (!CurrentOrder.empty()) {
3814 LLVM_DEBUG({
3815 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
3816 "with order";
3817 for (unsigned Idx : CurrentOrder)
3818 dbgs() << " " << Idx;
3819 dbgs() << "\n";
3820 });
3821 fixupOrderingIndices(CurrentOrder);
3822 // Insert new order with initial value 0, if it does not exist,
3823 // otherwise return the iterator to the existing one.
3824 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3825 ReuseShuffleIndicies, CurrentOrder);
3826 // This is a special case, as it does not gather, but at the same time
3827 // we are not extending buildTree_rec() towards the operands.
3828 ValueList Op0;
3829 Op0.assign(VL.size(), VL0->getOperand(0));
3830 VectorizableTree.back()->setOperand(0, Op0);
3831 return;
3832 }
3833 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
3834 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3835 ReuseShuffleIndicies);
3836 BS.cancelScheduling(VL, VL0);
3837 return;
3838 }
3839 case Instruction::InsertElement: {
3840 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
3841
3842 // Check that we have a buildvector and not a shuffle of 2 or more
3843 // different vectors.
3844 ValueSet SourceVectors;
3845 int MinIdx = std::numeric_limits<int>::max();
3846 for (Value *V : VL) {
3847 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
3848 Optional<int> Idx = *getInsertIndex(V, 0);
3849 if (!Idx || *Idx == UndefMaskElem)
3850 continue;
3851 MinIdx = std::min(MinIdx, *Idx);
3852 }
3853
3854 if (count_if(VL, [&SourceVectors](Value *V) {
3855 return !SourceVectors.contains(V);
3856 }) >= 2) {
3857 // Found 2nd source vector - cancel.
3858 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
3859 "different source vectors.\n");
3860 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3861 BS.cancelScheduling(VL, VL0);
3862 return;
3863 }
3864
3865 auto OrdCompare = [](const std::pair<int, int> &P1,
3866 const std::pair<int, int> &P2) {
3867 return P1.first > P2.first;
3868 };
3869 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
3870 decltype(OrdCompare)>
3871 Indices(OrdCompare);
3872 for (int I = 0, E = VL.size(); I < E; ++I) {
3873 Optional<int> Idx = *getInsertIndex(VL[I], 0);
3874 if (!Idx || *Idx == UndefMaskElem)
3875 continue;
3876 Indices.emplace(*Idx, I);
3877 }
3878 OrdersType CurrentOrder(VL.size(), VL.size());
3879 bool IsIdentity = true;
3880 for (int I = 0, E = VL.size(); I < E; ++I) {
3881 CurrentOrder[Indices.top().second] = I;
3882 IsIdentity &= Indices.top().second == I;
3883 Indices.pop();
3884 }
3885 if (IsIdentity)
3886 CurrentOrder.clear();
3887 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3888 None, CurrentOrder);
3889 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
3890
3891 constexpr int NumOps = 2;
3892 ValueList VectorOperands[NumOps];
3893 for (int I = 0; I < NumOps; ++I) {
3894 for (Value *V : VL)
3895 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
3896
3897 TE->setOperand(I, VectorOperands[I]);
3898 }
3899 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
3900 return;
3901 }
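// --- Editorial sketch (not part of SLPVectorizer.cpp) -----------------------
// The InsertElement case above derives a lane order from the insert indices
// via a priority queue and drops it when it is already the identity. The same
// computation on plain integers, assuming every index is defined:
#include <algorithm>
#include <numeric>
#include <vector>

static std::vector<unsigned>
orderFromInsertIndices(const std::vector<unsigned> &InsertIdx) {
  std::vector<unsigned> Pos(InsertIdx.size());
  std::iota(Pos.begin(), Pos.end(), 0u);
  // Lanes sorted by ascending insert index (what the PriorityQueue yields).
  std::sort(Pos.begin(), Pos.end(), [&InsertIdx](unsigned A, unsigned B) {
    return InsertIdx[A] < InsertIdx[B];
  });
  std::vector<unsigned> Order(InsertIdx.size());
  bool IsIdentity = true;
  for (unsigned Rank = 0, E = static_cast<unsigned>(Pos.size()); Rank < E;
       ++Rank) {
    Order[Pos[Rank]] = Rank;       // CurrentOrder[lane] = rank of that lane
    IsIdentity &= Pos[Rank] == Rank;
  }
  if (IsIdentity)
    Order.clear();                 // an empty order means "keep as is"
  return Order;
}
// ----------------------------------------------------------------------------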
3902 case Instruction::Load: {
3903 // Check that a vectorized load would load the same memory as a scalar
3904 // load. For example, we don't want to vectorize loads that are smaller
3905 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3906 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3907 // from such a struct, we read/write packed bits disagreeing with the
3908 // unvectorized version.
3909 SmallVector<Value *> PointerOps;
3910 OrdersType CurrentOrder;
3911 TreeEntry *TE = nullptr;
3912 switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder,
3913 PointerOps)) {
3914 case LoadsState::Vectorize:
3915 if (CurrentOrder.empty()) {
3916 // Original loads are consecutive and do not require reordering.
3917 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3918 ReuseShuffleIndicies);
3919 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
3920 } else {
3921 fixupOrderingIndices(CurrentOrder);
3922 // Need to reorder.
3923 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3924 ReuseShuffleIndicies, CurrentOrder);
3925 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
3926 }
3927 TE->setOperandsInOrder();
3928 break;
3929 case LoadsState::ScatterVectorize:
3930 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
3931 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
3932 UserTreeIdx, ReuseShuffleIndicies);
3933 TE->setOperandsInOrder();
3934 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
3935 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
3936 break;
3937 case LoadsState::Gather:
3938 BS.cancelScheduling(VL, VL0);
3939 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3940 ReuseShuffleIndicies);
3941#ifndef NDEBUG
3942 Type *ScalarTy = VL0->getType();
3943 if (DL->getTypeSizeInBits(ScalarTy) !=
3944 DL->getTypeAllocSizeInBits(ScalarTy))
3945 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
3946 else if (any_of(VL, [](Value *V) {
3947 return !cast<LoadInst>(V)->isSimple();
3948 }))
3949 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
3950 else
3951 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
3952#endif // NDEBUG
3953 break;
3954 }
3955 return;
3956 }
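// --- Editorial example (not part of the file under analysis) ----------------
// Scalar source whose four loads the Load case above would classify as
// LoadsState::Vectorize, assuming the target supports a 4 x i32 load; the
// function name is made up for illustration:
void copyFour(int *__restrict Out, const int *__restrict In) {
  // Consecutive addresses: the first-to-last pointer distance is 3 elements,
  // i.e. lanes - 1, so the four scalar loads can become one wide load.
  // Non-unit strides would instead be considered for a masked gather
  // (ScatterVectorize) or kept scalar (Gather).
  Out[0] = In[0];
  Out[1] = In[1];
  Out[2] = In[2];
  Out[3] = In[3];
}
// ----------------------------------------------------------------------------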
3957 case Instruction::ZExt:
3958 case Instruction::SExt:
3959 case Instruction::FPToUI:
3960 case Instruction::FPToSI:
3961 case Instruction::FPExt:
3962 case Instruction::PtrToInt:
3963 case Instruction::IntToPtr:
3964 case Instruction::SIToFP:
3965 case Instruction::UIToFP:
3966 case Instruction::Trunc:
3967 case Instruction::FPTrunc:
3968 case Instruction::BitCast: {
3969 Type *SrcTy = VL0->getOperand(0)->getType();
3970 for (Value *V : VL) {
3971 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
3972 if (Ty != SrcTy || !isValidElementType(Ty)) {
3973 BS.cancelScheduling(VL, VL0);
3974 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3975 ReuseShuffleIndicies);
3976 LLVM_DEBUG(dbgs()
3977 << "SLP: Gathering casts with different src types.\n");
3978 return;
3979 }
3980 }
3981 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3982 ReuseShuffleIndicies);
3983 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
3984
3985 TE->setOperandsInOrder();
3986 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
3987 ValueList Operands;
3988 // Prepare the operand vector.
3989 for (Value *V : VL)
3990 Operands.push_back(cast<Instruction>(V)->getOperand(i));
3991
3992 buildTree_rec(Operands, Depth + 1, {TE, i});
3993 }
3994 return;
3995 }
3996 case Instruction::ICmp:
3997 case Instruction::FCmp: {
3998 // Check that all of the compares have the same predicate.
3999 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
4000 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
4001 Type *ComparedTy = VL0->getOperand(0)->getType();
4002 for (Value *V : VL) {
4003 CmpInst *Cmp = cast<CmpInst>(V);
4004 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
4005 Cmp->getOperand(0)->getType() != ComparedTy) {
4006 BS.cancelScheduling(VL, VL0);
4007 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4008 ReuseShuffleIndicies);
4009 LLVM_DEBUG(dbgs()
4010 << "SLP: Gathering cmp with different predicate.\n");
4011 return;
4012 }
4013 }
4014
4015 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4016 ReuseShuffleIndicies);
4017 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
4018
4019 ValueList Left, Right;
4020 if (cast<CmpInst>(VL0)->isCommutative()) {
4021 // Commutative predicate - collect + sort operands of the instructions
4022 // so that each side is more likely to have the same opcode.
4023 assert(P0 == SwapP0 && "Commutative Predicate mismatch");
4024 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
4025 } else {
4026 // Collect operands - commute if it uses the swapped predicate.
4027 for (Value *V : VL) {
4028 auto *Cmp = cast<CmpInst>(V);
4029 Value *LHS = Cmp->getOperand(0);
4030 Value *RHS = Cmp->getOperand(1);
4031 if (Cmp->getPredicate() != P0)
4032 std::swap(LHS, RHS);
4033 Left.push_back(LHS);
4034 Right.push_back(RHS);
4035 }
4036 }
4037 TE->setOperand(0, Left);
4038 TE->setOperand(1, Right);
4039 buildTree_rec(Left, Depth + 1, {TE, 0});
4040 buildTree_rec(Right, Depth + 1, {TE, 1});
4041 return;
4042 }
4043 case Instruction::Select:
4044 case Instruction::FNeg:
4045 case Instruction::Add:
4046 case Instruction::FAdd:
4047 case Instruction::Sub:
4048 case Instruction::FSub:
4049 case Instruction::Mul:
4050 case Instruction::FMul:
4051 case Instruction::UDiv:
4052 case Instruction::SDiv:
4053 case Instruction::FDiv:
4054 case Instruction::URem:
4055 case Instruction::SRem:
4056 case Instruction::FRem:
4057 case Instruction::Shl:
4058 case Instruction::LShr:
4059 case Instruction::AShr:
4060 case Instruction::And:
4061 case Instruction::Or:
4062 case Instruction::Xor: {
4063 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4064 ReuseShuffleIndicies);
4065 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
4066
4067 // Sort operands of the instructions so that each side is more likely to
4068 // have the same opcode.
4069 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
4070 ValueList Left, Right;
4071 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
4072 TE->setOperand(0, Left);
4073 TE->setOperand(1, Right);
4074 buildTree_rec(Left, Depth + 1, {TE, 0});
4075 buildTree_rec(Right, Depth + 1, {TE, 1});
4076 return;
4077 }
4078
4079 TE->setOperandsInOrder();
4080 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
4081 ValueList Operands;
4082 // Prepare the operand vector.
4083 for (Value *V : VL)
4084 Operands.push_back(cast<Instruction>(V)->getOperand(i));
4085
4086 buildTree_rec(Operands, Depth + 1, {TE, i});
4087 }
4088 return;
4089 }
4090 case Instruction::GetElementPtr: {
4091 // We don't combine GEPs with complicated (nested) indexing.
4092 for (Value *V : VL) {
4093 if (cast<Instruction>(V)->getNumOperands() != 2) {
4094 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
4095 BS.cancelScheduling(VL, VL0);
4096 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4097 ReuseShuffleIndicies);
4098 return;
4099 }
4100 }
4101
4102 // We can't combine several GEPs into one vector if they operate on
4103 // different types.
4104 Type *Ty0 = VL0->getOperand(0)->getType();
4105 for (Value *V : VL) {
4106 Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
4107 if (Ty0 != CurTy) {
4108 LLVM_DEBUG(dbgs()
4109 << "SLP: not-vectorizable GEP (different types).\n");
4110 BS.cancelScheduling(VL, VL0);
4111 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4112 ReuseShuffleIndicies);
4113 return;
4114 }
4115 }
4116
4117 // We don't combine GEPs with non-constant indexes.
4118 Type *Ty1 = VL0->getOperand(1)->getType();
4119 for (Value *V : VL) {
4120 auto Op = cast<Instruction>(V)->getOperand(1);
4121 if (!isa<ConstantInt>(Op) ||
4122 (Op->getType() != Ty1 &&
4123 Op->getType()->getScalarSizeInBits() >
4124 DL->getIndexSizeInBits(
4125 V->getType()->getPointerAddressSpace()))) {
4126 LLVM_DEBUG(dbgs()
4127 << "SLP: not-vectorizable GEP (non-constant indexes).\n");
4128 BS.cancelScheduling(VL, VL0);
4129 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4130 ReuseShuffleIndicies);
4131 return;
4132 }
4133 }
4134
4135 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4136 ReuseShuffleIndicies);
4137 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
4138 SmallVector<ValueList, 2> Operands(2);
4139 // Prepare the operand vector for pointer operands.
4140 for (Value *V : VL)
4141 Operands.front().push_back(
4142 cast<GetElementPtrInst>(V)->getPointerOperand());
4143 TE->setOperand(0, Operands.front());
4144 // Need to cast all indices to the same type before vectorization to
4145 // avoid crash.
4146 // Required to be able to find correct matches between different gather
4147 // nodes and reuse the vectorized values rather than trying to gather them
4148 // again.
4149 int IndexIdx = 1;
4150 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
4151 Type *Ty = all_of(VL,
4152 [VL0Ty, IndexIdx](Value *V) {
4153 return VL0Ty == cast<GetElementPtrInst>(V)
4154 ->getOperand(IndexIdx)
4155 ->getType();
4156 })
4157 ? VL0Ty
4158 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
4159 ->getPointerOperandType()
4160 ->getScalarType());
4161 // Prepare the operand vector.
4162 for (Value *V : VL) {
4163 auto *Op = cast<Instruction>(V)->getOperand(IndexIdx);
4164 auto *CI = cast<ConstantInt>(Op);
4165 Operands.back().push_back(ConstantExpr::getIntegerCast(
4166 CI, Ty, CI->getValue().isSignBitSet()));
4167 }
4168 TE->setOperand(IndexIdx, Operands.back());
4169
4170 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
4171 buildTree_rec(Operands[I], Depth + 1, {TE, I});
4172 return;
4173 }
4174 case Instruction::Store: {
4175 // Check if the stores are consecutive or if we need to swizzle them.
4176 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
4177 // Avoid types that are padded when being allocated as scalars, while
4178 // being packed together in a vector (such as i1).
4179 if (DL->getTypeSizeInBits(ScalarTy) !=
4180 DL->getTypeAllocSizeInBits(ScalarTy)) {
4181 BS.cancelScheduling(VL, VL0);
4182 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4183 ReuseShuffleIndicies);
4184 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
4185 return;
4186 }
4187 // Make sure all stores in the bundle are simple - we can't vectorize
4188 // atomic or volatile stores.
4189 SmallVector<Value *, 4> PointerOps(VL.size());
4190 ValueList Operands(VL.size());
4191 auto POIter = PointerOps.begin();
4192 auto OIter = Operands.begin();
4193 for (Value *V : VL) {
4194 auto *SI = cast<StoreInst>(V);
4195 if (!SI->isSimple()) {
4196 BS.cancelScheduling(VL, VL0);
4197 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4198 ReuseShuffleIndicies);
4199 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
4200 return;
4201 }
4202 *POIter = SI->getPointerOperand();
4203 *OIter = SI->getValueOperand();
4204 ++POIter;
4205 ++OIter;
4206 }
4207
4208 OrdersType CurrentOrder;
4209 // Check the order of pointer operands.
4210 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
4211 Value *Ptr0;
4212 Value *PtrN;
4213 if (CurrentOrder.empty()) {
4214 Ptr0 = PointerOps.front();
4215 PtrN = PointerOps.back();
4216 } else {
4217 Ptr0 = PointerOps[CurrentOrder.front()];
4218 PtrN = PointerOps[CurrentOrder.back()];
4219 }
4220 Optional<int> Dist =
4221 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4222 // Check that the sorted pointer operands are consecutive.
4223 if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
4224 if (CurrentOrder.empty()) {
4225 // Original stores are consecutive and do not require reordering.
4226 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
4227 UserTreeIdx, ReuseShuffleIndicies);
4228 TE->setOperandsInOrder();
4229 buildTree_rec(Operands, Depth + 1, {TE, 0});
4230 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
4231 } else {
4232 fixupOrderingIndices(CurrentOrder);
4233 TreeEntry *TE =
4234 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4235 ReuseShuffleIndicies, CurrentOrder);
4236 TE->setOperandsInOrder();
4237 buildTree_rec(Operands, Depth + 1, {TE, 0});
4238 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
4239 }
4240 return;
4241 }
4242 }
4243
4244 BS.cancelScheduling(VL, VL0);
4245 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4246 ReuseShuffleIndicies);
4247 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
4248 return;
4249 }
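// --- Editorial example (not part of the file under analysis) ----------------
// Store bundle the Store case above would vectorize as "jumbled stores",
// assuming a 4 x i32 vector store is legal; the function name is made up:
void storeFourShuffled(int *__restrict Out, int A, int B, int C, int D) {
  // The pointer operands sort to Out[0..3] and the sorted first-to-last
  // distance is 3 == lanes - 1, so the bundle is vectorized with a
  // reordering shuffle of the value operands.
  Out[2] = C;
  Out[0] = A;
  Out[3] = D;
  Out[1] = B;
}
// ----------------------------------------------------------------------------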
4250 case Instruction::Call: {
4251 // Check if the calls are all to the same vectorizable intrinsic or
4252 // library function.
4253 CallInst *CI = cast<CallInst>(VL0);
4254 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4255
4256 VFShape Shape = VFShape::get(
4257 *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
4258 false /*HasGlobalPred*/);
4259 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
4260
4261 if (!VecFunc && !isTriviallyVectorizable(ID)) {
4262 BS.cancelScheduling(VL, VL0);
4263 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4264 ReuseShuffleIndicies);
4265 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
4266 return;
4267 }
4268 Function *F = CI->getCalledFunction();
4269 unsigned NumArgs = CI->arg_size();
4270 SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
4271 for (unsigned j = 0; j != NumArgs; ++j)
4272 if (hasVectorInstrinsicScalarOpd(ID, j))
4273 ScalarArgs[j] = CI->getArgOperand(j);
4274 for (Value *V : VL) {
4275 CallInst *CI2 = dyn_cast<CallInst>(V);
4276 if (!CI2 || CI2->getCalledFunction() != F ||
4277 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
4278 (VecFunc &&
4279 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
4280 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
4281 BS.cancelScheduling(VL, VL0);
4282 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4283 ReuseShuffleIndicies);
4284 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
4285 << "\n");
4286 return;
4287 }
4288 // Some intrinsics have scalar arguments and should be same in order for
4289 // them to be vectorized.
4290 for (unsigned j = 0; j != NumArgs; ++j) {
4291 if (hasVectorInstrinsicScalarOpd(ID, j)) {
4292 Value *A1J = CI2->getArgOperand(j);
4293 if (ScalarArgs[j] != A1J) {
4294 BS.cancelScheduling(VL, VL0);
4295 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4296 ReuseShuffleIndicies);
4297 LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
4298 << " argument " << ScalarArgs[j] << "!=" << A1J
4299 << "\n");
4300 return;
4301 }
4302 }
4303 }
4304 // Verify that the bundle operands are identical between the two calls.
4305 if (CI->hasOperandBundles() &&
4306 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
4307 CI->op_begin() + CI->getBundleOperandsEndIndex(),
4308 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
4309 BS.cancelScheduling(VL, VL0);
4310 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4311 ReuseShuffleIndicies);
4312 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
4313 << *CI << "!=" << *V << '\n');
4314 return;
4315 }
4316 }
4317
4318 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4319 ReuseShuffleIndicies);
4320 TE->setOperandsInOrder();
4321 for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
4322 // For scalar operands no need to create an entry since no need to
4323 // vectorize it.
4324 if (hasVectorInstrinsicScalarOpd(ID, i))
4325 continue;
4326 ValueList Operands;
4327 // Prepare the operand vector.
4328 for (Value *V : VL) {
4329 auto *CI2 = cast<CallInst>(V);
4330 Operands.push_back(CI2->getArgOperand(i));
4331 }
4332 buildTree_rec(Operands, Depth + 1, {TE, i});
4333 }
4334 return;
4335 }
4336 case Instruction::ShuffleVector: {
4337 // If this is not an alternate sequence of opcode like add-sub
4338 // then do not vectorize this instruction.
4339 if (!S.isAltShuffle()) {
4340 BS.cancelScheduling(VL, VL0);
4341 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4342 ReuseShuffleIndicies);
4343 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
4344 return;
4345 }
4346 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4347 ReuseShuffleIndicies);
4348 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
4349
4350 // Reorder operands if reordering would enable vectorization.
4351 if (isa<BinaryOperator>(VL0)) {
4352 ValueList Left, Right;
4353 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
4354 TE->setOperand(0, Left);
4355 TE->setOperand(1, Right);
4356 buildTree_rec(Left, Depth + 1, {TE, 0});
4357 buildTree_rec(Right, Depth + 1, {TE, 1});
4358 return;
4359 }
4360
4361 TE->setOperandsInOrder();
4362 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
4363 ValueList Operands;
4364 // Prepare the operand vector.
4365 for (Value *V : VL)
4366 Operands.push_back(cast<Instruction>(V)->getOperand(i));
4367
4368 buildTree_rec(Operands, Depth + 1, {TE, i});
4369 }
4370 return;
4371 }
4372 default:
4373 BS.cancelScheduling(VL, VL0);
4374 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4375 ReuseShuffleIndicies);
4376 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
4377 return;
4378 }
4379}
4380
4381unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
4382 unsigned N = 1;
4383 Type *EltTy = T;
4384
4385 while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
4386 isa<VectorType>(EltTy)) {
4387 if (auto *ST = dyn_cast<StructType>(EltTy)) {
4388 // Check that struct is homogeneous.
4389 for (const auto *Ty : ST->elements())
4390 if (Ty != *ST->element_begin())
4391 return 0;
4392 N *= ST->getNumElements();
4393 EltTy = *ST->element_begin();
4394 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
4395 N *= AT->getNumElements();
4396 EltTy = AT->getElementType();
4397 } else {
4398 auto *VT = cast<FixedVectorType>(EltTy);
4399 N *= VT->getNumElements();
4400 EltTy = VT->getElementType();
4401 }
4402 }
4403
4404 if (!isValidElementType(EltTy))
4405 return 0;
4406 uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
4407 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
4408 return 0;
4409 return N;
4410}
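// --- Editorial example (not part of the file under analysis) ----------------
// Aggregates canMapToVector above would accept or reject, assuming i32
// elements and a target whose register-size limits the computed width fits:
struct HomogeneousPair { int A[2]; int B[2]; }; // both members are [2 x i32]:
                                                // flattens to N = 4 elements
struct MixedPair       { int A; float B; };     // mixed element types:
                                                // canMapToVector returns 0
// ----------------------------------------------------------------------------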
4411
4412bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
4413 SmallVectorImpl<unsigned> &CurrentOrder) const {
4414 const auto *It = find_if(VL, [](Value *V) {
4415 return isa<ExtractElementInst, ExtractValueInst>(V);
4416 });
4417 assert(It != VL.end() && "Expected at least one extract instruction.");
4418 auto *E0 = cast<Instruction>(*It);
4419 assert(all_of(VL,
4420 [](Value *V) {
4421 return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
4422 V);
4423 }) &&
4424 "Invalid opcode");
4425 // Check if all of the extracts come from the same vector and from the
4426 // correct offset.
4427 Value *Vec = E0->getOperand(0);
4428
4429 CurrentOrder.clear();
4430
4431 // We have to extract from a vector/aggregate with the same number of elements.
4432 unsigned NElts;
4433 if (E0->getOpcode() == Instruction::ExtractValue) {
4434 const DataLayout &DL = E0->getModule()->getDataLayout();
4435 NElts = canMapToVector(Vec->getType(), DL);
4436 if (!NElts)
4437 return false;
4438 // Check if load can be rewritten as load of vector.
4439 LoadInst *LI = dyn_cast<LoadInst>(Vec);
4440 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
4441 return false;
4442 } else {
4443 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
4444 }
4445
4446 if (NElts != VL.size())
4447 return false;
4448
4449 // Check that all of the indices extract from the correct offset.
4450 bool ShouldKeepOrder = true;
4451 unsigned E = VL.size();
4452 // Assign to all items the initial value E + 1 so we can check if the extract
4453 // instruction index was used already.
4454 // Also, later we can check that all the indices are used and we have a
4455 // consecutive access in the extract instructions, by checking that no
4456 // element of CurrentOrder still has value E + 1.
4457 CurrentOrder.assign(E, E);
4458 unsigned I = 0;
4459 for (; I < E; ++I) {
4460 auto *Inst = dyn_cast<Instruction>(VL[I]);
4461 if (!Inst)
4462 continue;
4463 if (Inst->getOperand(0) != Vec)
4464 break;
4465 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
4466 if (isa<UndefValue>(EE->getIndexOperand()))
4467 continue;
4468 Optional<unsigned> Idx = getExtractIndex(Inst);
4469 if (!Idx)
4470 break;
4471 const unsigned ExtIdx = *Idx;
4472 if (ExtIdx != I) {
4473 if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
4474 break;
4475 ShouldKeepOrder = false;
4476 CurrentOrder[ExtIdx] = I;
4477 } else {
4478 if (CurrentOrder[I] != E)
4479 break;
4480 CurrentOrder[I] = I;
4481 }
4482 }
4483 if (I < E) {
4484 CurrentOrder.clear();
4485 return false;
4486 }
4487 if (ShouldKeepOrder)
4488 CurrentOrder.clear();
4489
4490 return ShouldKeepOrder;
4491}
4492
4493bool BoUpSLP::areAllUsersVectorized(Instruction *I,
4494 ArrayRef<Value *> VectorizedVals) const {
4495 return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
4496 all_of(I->users(), [this](User *U) {
4497 return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
4498 });
4499}
4500
4501static std::pair<InstructionCost, InstructionCost>
4502getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
4503 TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
4504 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4505
4506 // Calculate the cost of the scalar and vector calls.
4507 SmallVector<Type *, 4> VecTys;
4508 for (Use &Arg : CI->args())
4509 VecTys.push_back(
4510 FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
4511 FastMathFlags FMF;
4512 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
4513 FMF = FPCI->getFastMathFlags();
4514 SmallVector<const Value *> Arguments(CI->args());
4515 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
4516 dyn_cast<IntrinsicInst>(CI));
4517 auto IntrinsicCost =
4518 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
4519
4520 auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
4521 VecTy->getNumElements())),
4522 false /*HasGlobalPred*/);
4523 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
4524 auto LibCost = IntrinsicCost;
4525 if (!CI->isNoBuiltin() && VecFunc) {
4526 // Calculate the cost of the vector library call.
4527 // If the corresponding vector call is cheaper, return its cost.
4528 LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
4529 TTI::TCK_RecipThroughput);
4530 }
4531 return {IntrinsicCost, LibCost};
4532}
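// --- Editorial usage sketch (hypothetical caller, not from this file) --------
// The {intrinsic cost, library-call cost} pair returned above is normally
// consumed by taking the cheaper of the two lowerings for the call bundle:
#include <algorithm>
#include <utility>

static InstructionCost
pickCheaperVectorCallCost(std::pair<InstructionCost, InstructionCost> Costs) {
  return std::min(Costs.first, Costs.second); // cheaper of intrinsic vs. libcall
}
// ----------------------------------------------------------------------------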
4533
4534/// Compute the cost of creating a vector of type \p VecTy containing the
4535/// extracted values from \p VL.
4536static InstructionCost
4537computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
4538 TargetTransformInfo::ShuffleKind ShuffleKind,
4539 ArrayRef<int> Mask, TargetTransformInfo &TTI) {
4540 unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
4541
4542 if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
4543 VecTy->getNumElements() < NumOfParts)
4544 return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
4545
4546 bool AllConsecutive = true;
4547 unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
4548 unsigned Idx = -1;
4549 InstructionCost Cost = 0;
4550
4551 // Process extracts in blocks of EltsPerVector to check if the source vector
4552 // operand can be re-used directly. If not, add the cost of creating a shuffle
4553 // to extract the values into a vector register.
4554 for (auto *V : VL) {
4555 ++Idx;
4556
4557 // Need to exclude undefs from analysis.
4558 if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
4559 continue;
4560
4561 // Reached the start of a new vector register.
4562 if (Idx % EltsPerVector == 0) {
4563 AllConsecutive = true;
4564 continue;
4565 }
4566
4567 // Check all extracts for a vector register on the target directly
4568 // extract values in order.
4569 unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
4570 if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
4571 unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
4572 AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
4573 CurrentIdx % EltsPerVector == Idx % EltsPerVector;
4574 }
4575
4576 if (AllConsecutive)
4577 continue;
4578
4579 // Skip all indices, except for the last index per vector block.
4580 if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
4581 continue;
4582
4583 // If we have a series of extracts which are not consecutive and hence
4584 // cannot re-use the source vector register directly, compute the shuffle
4585 // cost to extract a vector with EltsPerVector elements.
4586 Cost += TTI.getShuffleCost(
4587 TargetTransformInfo::SK_PermuteSingleSrc,
4588 FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
4589 }
4590 return Cost;
4591}
4592
4593/// Build shuffle mask for shuffle graph entries and lists of main and alternate
4594 /// operations' operands.
4595static void
4596buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
4597 ArrayRef<int> ReusesIndices,
4598 const function_ref<bool(Instruction *)> IsAltOp,
4599 SmallVectorImpl<int> &Mask,
4600 SmallVectorImpl<Value *> *OpScalars = nullptr,
4601 SmallVectorImpl<Value *> *AltScalars = nullptr) {
4602 unsigned Sz = VL.size();
4603 Mask.assign(Sz, UndefMaskElem);
4604 SmallVector<int> OrderMask;
4605 if (!ReorderIndices.empty())
4606 inversePermutation(ReorderIndices, OrderMask);
4607 for (unsigned I = 0; I < Sz; ++I) {
4608 unsigned Idx = I;
4609 if (!ReorderIndices.empty())
4610 Idx = OrderMask[I];
4611 auto *OpInst = cast<Instruction>(VL[Idx]);
4612 if (IsAltOp(OpInst)) {
4613 Mask[I] = Sz + Idx;
4614 if (AltScalars)
4615 AltScalars->push_back(OpInst);
4616 } else {
4617 Mask[I] = Idx;
4618 if (OpScalars)
4619 OpScalars->push_back(OpInst);
4620 }
4621 }
4622 if (!ReusesIndices.empty()) {
4623 SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
4624 transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
4625 return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;
4626 });
4627 Mask.swap(NewMask);
4628 }
4629}
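// --- Editorial example (not part of the file under analysis) ----------------
// For a 4-wide add/sub alternating bundle with identity order and no reuse
// indices, the mask built by buildSuffleEntryMask selects main-operation lanes
// from the first vector and alternate-operation lanes from the second:
//   IsAltOp = "is a sub", Sz = 4  ->  Mask = {0, 4 + 1, 2, 4 + 3} = {0, 5, 2, 7}
static const int ExpectedAddSubMask[4] = {0, 5, 2, 7};
// ----------------------------------------------------------------------------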
4630
4631InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
4632 ArrayRef<Value *> VectorizedVals) {
4633 ArrayRef<Value*> VL = E->Scalars;
4634
4635 Type *ScalarTy = VL[0]->getType();
4636 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
4637 ScalarTy = SI->getValueOperand()->getType();
4638 else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
4639 ScalarTy = CI->getOperand(0)->getType();
4640 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
4641 ScalarTy = IE->getOperand(1)->getType();
4642 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
4643 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4644
4645 // If we have computed a smaller type for the expression, update VecTy so
4646 // that the costs will be accurate.
4647 if (MinBWs.count(VL[0]))
4648 VecTy = FixedVectorType::get(
4649 IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
4650 unsigned EntryVF = E->getVectorFactor();
4651 auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
4652
4653 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
4654 // FIXME: it tries to fix a problem with MSVC buildbots.
4655 TargetTransformInfo &TTIRef = *TTI;
4656 auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
4657 VectorizedVals, E](InstructionCost &Cost) {
4658 DenseMap<Value *, int> ExtractVectorsTys;
4659 SmallPtrSet<Value *, 4> CheckedExtracts;
4660 for (auto *V : VL) {
4661 if (isa<UndefValue>(V))
4662 continue;
4663 // If all users of instruction are going to be vectorized and this
4664 // instruction itself is not going to be vectorized, consider this
4665 // instruction as dead and remove its cost from the final cost of the
4666 // vectorized tree.
4667 // Also, avoid adjusting the cost for extractelements with multiple uses
4668 // in different graph entries.
4669 const TreeEntry *VE = getTreeEntry(V);
4670 if (!CheckedExtracts.insert(V).second ||
4671 !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
4672 (VE && VE != E))
4673 continue;
4674 auto *EE = cast<ExtractElementInst>(V);
4675 Optional<unsigned> EEIdx = getExtractIndex(EE);
4676 if (!EEIdx)
4677 continue;
4678 unsigned Idx = *EEIdx;
4679 if (TTIRef.getNumberOfParts(VecTy) !=
4680 TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
4681 auto It =
4682 ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
4683 It->getSecond() = std::min<int>(It->second, Idx);
4684 }
4685 // Take credit for instruction that will become dead.
4686 if (EE->hasOneUse()) {
4687 Instruction *Ext = EE->user_back();
4688 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4689 all_of(Ext->users(),
4690 [](User *U) { return isa<GetElementPtrInst>(U); })) {
4691 // Use getExtractWithExtendCost() to calculate the cost of
4692 // extractelement/ext pair.
4693 Cost -=
4694 TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
4695 EE->getVectorOperandType(), Idx);
4696 // Add back the cost of s|zext which is subtracted separately.
4697 Cost += TTIRef.getCastInstrCost(
4698 Ext->getOpcode(), Ext->getType(), EE->getType(),
4699 TTI::getCastContextHint(Ext), CostKind, Ext);
4700 continue;
4701 }
4702 }
4703 Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement,
4704 EE->getVectorOperandType(), Idx);
4705 }
4706 // Add a cost for subvector extracts/inserts if required.
4707 for (const auto &Data : ExtractVectorsTys) {
4708 auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
4709 unsigned NumElts = VecTy->getNumElements();
4710 if (Data.second % NumElts == 0)
4711 continue;
4712 if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
4713 unsigned Idx = (Data.second / NumElts) * NumElts;
4714 unsigned EENumElts = EEVTy->getNumElements();
4715 if (Idx + NumElts <= EENumElts) {
4716 Cost +=
4717 TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
4718 EEVTy, None, Idx, VecTy);
4719 } else {
4720 // Need to round up the subvector type vectorization factor to avoid a
4721 // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
4722 // <= EENumElts.
4723 auto *SubVT =
4724 FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
4725 Cost +=
4726 TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
4727 EEVTy, None, Idx, SubVT);
4728 }
4729 } else {
4730 Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
4731 VecTy, None, 0, EEVTy);
4732 }
4733 }
4734 };
4735 if (E->State == TreeEntry::NeedToGather) {
4736 if (allConstant(VL))
4737 return 0;
4738 if (isa<InsertElementInst>(VL[0]))
4739 return InstructionCost::getInvalid();
4740 SmallVector<int> Mask;
4741 SmallVector<const TreeEntry *> Entries;
4742 Optional<TargetTransformInfo::ShuffleKind> Shuffle =
4743 isGatherShuffledEntry(E, Mask, Entries);
4744 if (Shuffle.hasValue()) {
4745 InstructionCost GatherCost = 0;
4746 if (ShuffleVectorInst::isIdentityMask(Mask)) {
4747 // Perfect match in the graph, will reuse the previously vectorized
4748 // node. Cost is 0.
4749        LLVM_DEBUG(
4750            dbgs()
4751            << "SLP: perfect diamond match for gather bundle that starts with "
4752            << *VL.front() << ".\n");
4753 if (NeedToShuffleReuses)
4754 GatherCost =
4755 TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4756 FinalVecTy, E->ReuseShuffleIndices);
4757 } else {
4758        LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
4759                          << " entries for bundle that starts with "
4760                          << *VL.front() << ".\n");
4761 // Detected that instead of gather we can emit a shuffle of single/two
4762 // previously vectorized nodes. Add the cost of the permutation rather
4763 // than gather.
4764 ::addMask(Mask, E->ReuseShuffleIndices);
4765 GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
4766 }
4767 return GatherCost;
4768 }
4769 if ((E->getOpcode() == Instruction::ExtractElement ||
4770 all_of(E->Scalars,
4771 [](Value *V) {
4772 return isa<ExtractElementInst, UndefValue>(V);
4773 })) &&
4774 allSameType(VL)) {
4775 // Check that gather of extractelements can be represented as just a
4776 // shuffle of a single/two vectors the scalars are extracted from.
4777 SmallVector<int> Mask;
4778 Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
4779 isFixedVectorShuffle(VL, Mask);
4780 if (ShuffleKind.hasValue()) {
4781 // Found the bunch of extractelement instructions that must be gathered
4782        // into a vector and can be represented as a permutation of elements in a
4783 // single input vector or of 2 input vectors.
4784 InstructionCost Cost =
4785 computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
4786 AdjustExtractsCost(Cost);
4787 if (NeedToShuffleReuses)
4788 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4789 FinalVecTy, E->ReuseShuffleIndices);
4790 return Cost;
4791 }
4792 }
4793 if (isSplat(VL)) {
4794 // Found the broadcasting of the single scalar, calculate the cost as the
4795 // broadcast.
4796      assert(VecTy == FinalVecTy &&
4797             "No reused scalars expected for broadcast.");
4798 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
4799 }
4800 InstructionCost ReuseShuffleCost = 0;
4801 if (NeedToShuffleReuses)
4802 ReuseShuffleCost = TTI->getShuffleCost(
4803 TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
4804 // Improve gather cost for gather of loads, if we can group some of the
4805 // loads into vector loads.
4806 if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
4807 !E->isAltShuffle()) {
4808 BoUpSLP::ValueSet VectorizedLoads;
4809 unsigned StartIdx = 0;
4810 unsigned VF = VL.size() / 2;
4811 unsigned VectorizedCnt = 0;
4812 unsigned ScatterVectorizeCnt = 0;
4813 const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
4814 for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
4815 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
4816 Cnt += VF) {
4817 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4818 if (!VectorizedLoads.count(Slice.front()) &&
4819 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
4820 SmallVector<Value *> PointerOps;
4821 OrdersType CurrentOrder;
4822 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL,
4823 *SE, CurrentOrder, PointerOps);
4824 switch (LS) {
4825 case LoadsState::Vectorize:
4826 case LoadsState::ScatterVectorize:
4827 // Mark the vectorized loads so that we don't vectorize them
4828 // again.
4829 if (LS == LoadsState::Vectorize)
4830 ++VectorizedCnt;
4831 else
4832 ++ScatterVectorizeCnt;
4833 VectorizedLoads.insert(Slice.begin(), Slice.end());
4834 // If we vectorized initial block, no need to try to vectorize it
4835 // again.
4836 if (Cnt == StartIdx)
4837 StartIdx += VF;
4838 break;
4839 case LoadsState::Gather:
4840 break;
4841 }
4842 }
4843 }
4844 // Check if the whole array was vectorized already - exit.
4845 if (StartIdx >= VL.size())
4846 break;
4847 // Found vectorizable parts - exit.
4848 if (!VectorizedLoads.empty())
4849 break;
4850 }
4851 if (!VectorizedLoads.empty()) {
4852 InstructionCost GatherCost = 0;
4853 unsigned NumParts = TTI->getNumberOfParts(VecTy);
4854 bool NeedInsertSubvectorAnalysis =
4855 !NumParts || (VL.size() / VF) > NumParts;
4856 // Get the cost for gathered loads.
4857 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
4858 if (VectorizedLoads.contains(VL[I]))
4859 continue;
4860 GatherCost += getGatherCost(VL.slice(I, VF));
4861 }
4862 // The cost for vectorized loads.
4863 InstructionCost ScalarsCost = 0;
4864 for (Value *V : VectorizedLoads) {
4865 auto *LI = cast<LoadInst>(V);
4866 ScalarsCost += TTI->getMemoryOpCost(
4867 Instruction::Load, LI->getType(), LI->getAlign(),
4868 LI->getPointerAddressSpace(), CostKind, LI);
4869 }
4870 auto *LI = cast<LoadInst>(E->getMainOp());
4871 auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
4872 Align Alignment = LI->getAlign();
4873 GatherCost +=
4874 VectorizedCnt *
4875 TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
4876 LI->getPointerAddressSpace(), CostKind, LI);
4877 GatherCost += ScatterVectorizeCnt *
4878 TTI->getGatherScatterOpCost(
4879 Instruction::Load, LoadTy, LI->getPointerOperand(),
4880 /*VariableMask=*/false, Alignment, CostKind, LI);
4881 if (NeedInsertSubvectorAnalysis) {
4882 // Add the cost for the subvectors insert.
4883 for (int I = VF, E = VL.size(); I < E; I += VF)
4884 GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
4885 None, I, LoadTy);
4886 }
4887 return ReuseShuffleCost + GatherCost - ScalarsCost;
4888 }
4889 }
4890 return ReuseShuffleCost + getGatherCost(VL);
4891 }
4892 InstructionCost CommonCost = 0;
4893 SmallVector<int> Mask;
4894 if (!E->ReorderIndices.empty()) {
4895 SmallVector<int> NewMask;
4896 if (E->getOpcode() == Instruction::Store) {
4897 // For stores the order is actually a mask.
4898 NewMask.resize(E->ReorderIndices.size());
4899 copy(E->ReorderIndices, NewMask.begin());
4900 } else {
4901 inversePermutation(E->ReorderIndices, NewMask);
4902 }
4903 ::addMask(Mask, NewMask);
4904 }
4905 if (NeedToShuffleReuses)
4906 ::addMask(Mask, E->ReuseShuffleIndices);
4907 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
4908 CommonCost =
4909 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
4910  assert((E->State == TreeEntry::Vectorize ||
4911          E->State == TreeEntry::ScatterVectorize) &&
4912         "Unhandled state");
4913  assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
4914 Instruction *VL0 = E->getMainOp();
4915 unsigned ShuffleOrOp =
4916 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
4917 switch (ShuffleOrOp) {
4918 case Instruction::PHI:
4919 return 0;
4920
4921 case Instruction::ExtractValue:
4922 case Instruction::ExtractElement: {
4923    // The common cost of removing ExtractElement/ExtractValue instructions +
4924    // the cost of shuffles, if required to reshuffle the original vector.
4925 if (NeedToShuffleReuses) {
4926 unsigned Idx = 0;
4927 for (unsigned I : E->ReuseShuffleIndices) {
4928 if (ShuffleOrOp == Instruction::ExtractElement) {
4929 auto *EE = cast<ExtractElementInst>(VL[I]);
4930 CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
4931 EE->getVectorOperandType(),
4932 *getExtractIndex(EE));
4933 } else {
4934 CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
4935 VecTy, Idx);
4936 ++Idx;
4937 }
4938 }
4939 Idx = EntryVF;
4940 for (Value *V : VL) {
4941 if (ShuffleOrOp == Instruction::ExtractElement) {
4942 auto *EE = cast<ExtractElementInst>(V);
4943 CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
4944 EE->getVectorOperandType(),
4945 *getExtractIndex(EE));
4946 } else {
4947 --Idx;
4948 CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
4949 VecTy, Idx);
4950 }
4951 }
4952 }
4953 if (ShuffleOrOp == Instruction::ExtractValue) {
4954 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
4955 auto *EI = cast<Instruction>(VL[I]);
4956 // Take credit for instruction that will become dead.
4957 if (EI->hasOneUse()) {
4958 Instruction *Ext = EI->user_back();
4959 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4960 all_of(Ext->users(),
4961 [](User *U) { return isa<GetElementPtrInst>(U); })) {
4962 // Use getExtractWithExtendCost() to calculate the cost of
4963 // extractelement/ext pair.
4964 CommonCost -= TTI->getExtractWithExtendCost(
4965 Ext->getOpcode(), Ext->getType(), VecTy, I);
4966 // Add back the cost of s|zext which is subtracted separately.
4967 CommonCost += TTI->getCastInstrCost(
4968 Ext->getOpcode(), Ext->getType(), EI->getType(),
4969 TTI::getCastContextHint(Ext), CostKind, Ext);
4970 continue;
4971 }
4972 }
4973 CommonCost -=
4974 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
4975 }
4976 } else {
4977 AdjustExtractsCost(CommonCost);
4978 }
4979 return CommonCost;
4980 }
4981 case Instruction::InsertElement: {
4982    assert(E->ReuseShuffleIndices.empty() &&
4983           "Unique insertelements only are expected.");
4984 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
4985
4986 unsigned const NumElts = SrcVecTy->getNumElements();
4987 unsigned const NumScalars = VL.size();
4988 APInt DemandedElts = APInt::getZero(NumElts);
4989 // TODO: Add support for Instruction::InsertValue.
4990 SmallVector<int> Mask;
4991 if (!E->ReorderIndices.empty()) {
4992 inversePermutation(E->ReorderIndices, Mask);
4993 Mask.append(NumElts - NumScalars, UndefMaskElem);
4994 } else {
4995 Mask.assign(NumElts, UndefMaskElem);
4996 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
4997 }
4998 unsigned Offset = *getInsertIndex(VL0, 0);
4999 bool IsIdentity = true;
5000 SmallVector<int> PrevMask(NumElts, UndefMaskElem);
5001 Mask.swap(PrevMask);
5002 for (unsigned I = 0; I < NumScalars; ++I) {
5003 Optional<int> InsertIdx = getInsertIndex(VL[PrevMask[I]], 0);
5004 if (!InsertIdx || *InsertIdx == UndefMaskElem)
5005 continue;
5006 DemandedElts.setBit(*InsertIdx);
5007 IsIdentity &= *InsertIdx - Offset == I;
5008 Mask[*InsertIdx - Offset] = I;
5009 }
5010    assert(Offset < NumElts && "Failed to find vector index offset");
5011
5012 InstructionCost Cost = 0;
5013 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
5014 /*Insert*/ true, /*Extract*/ false);
5015
5016 if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
5017 // FIXME: Replace with SK_InsertSubvector once it is properly supported.
5018 unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
5019 Cost += TTI->getShuffleCost(
5020 TargetTransformInfo::SK_PermuteSingleSrc,
5021 FixedVectorType::get(SrcVecTy->getElementType(), Sz));
5022 } else if (!IsIdentity) {
5023 auto *FirstInsert =
5024 cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
5025 return !is_contained(E->Scalars,
5026 cast<Instruction>(V)->getOperand(0));
5027 }));
5028 if (isUndefVector(FirstInsert->getOperand(0))) {
5029 Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
5030 } else {
5031 SmallVector<int> InsertMask(NumElts);
5032 std::iota(InsertMask.begin(), InsertMask.end(), 0);
5033 for (unsigned I = 0; I < NumElts; I++) {
5034 if (Mask[I] != UndefMaskElem)
5035 InsertMask[Offset + I] = NumElts + I;
5036 }
5037 Cost +=
5038 TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
5039 }
5040 }
5041
5042 return Cost;
5043 }
5044 case Instruction::ZExt:
5045 case Instruction::SExt:
5046 case Instruction::FPToUI:
5047 case Instruction::FPToSI:
5048 case Instruction::FPExt:
5049 case Instruction::PtrToInt:
5050 case Instruction::IntToPtr:
5051 case Instruction::SIToFP:
5052 case Instruction::UIToFP:
5053 case Instruction::Trunc:
5054 case Instruction::FPTrunc:
5055 case Instruction::BitCast: {
5056 Type *SrcTy = VL0->getOperand(0)->getType();
5057 InstructionCost ScalarEltCost =
5058 TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
5059 TTI::getCastContextHint(VL0), CostKind, VL0);
5060 if (NeedToShuffleReuses) {
5061 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5062 }
5063
5064 // Calculate the cost of this instruction.
5065 InstructionCost ScalarCost = VL.size() * ScalarEltCost;
5066
5067 auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
5068 InstructionCost VecCost = 0;
5069 // Check if the values are candidates to demote.
5070 if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
5071 VecCost = CommonCost + TTI->getCastInstrCost(
5072 E->getOpcode(), VecTy, SrcVecTy,
5073 TTI::getCastContextHint(VL0), CostKind, VL0);
5074 }
5075    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5076 return VecCost - ScalarCost;
5077 }
5078 case Instruction::FCmp:
5079 case Instruction::ICmp:
5080 case Instruction::Select: {
5081 // Calculate the cost of this instruction.
5082 InstructionCost ScalarEltCost =
5083 TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
5084 CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
5085 if (NeedToShuffleReuses) {
5086 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5087 }
5088 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
5089 InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
5090
5091 // Check if all entries in VL are either compares or selects with compares
5092 // as condition that have the same predicates.
5093 CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
5094 bool First = true;
5095 for (auto *V : VL) {
5096 CmpInst::Predicate CurrentPred;
5097 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
5098 if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
5099 !match(V, MatchCmp)) ||
5100 (!First && VecPred != CurrentPred)) {
5101 VecPred = CmpInst::BAD_ICMP_PREDICATE;
5102 break;
5103 }
5104 First = false;
5105 VecPred = CurrentPred;
5106 }
5107
5108 InstructionCost VecCost = TTI->getCmpSelInstrCost(
5109 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
5110 // Check if it is possible and profitable to use min/max for selects in
5111 // VL.
5112 //
5113 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
5114 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
5115 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
5116 {VecTy, VecTy});
5117 InstructionCost IntrinsicCost =
5118 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
5119 // If the selects are the only uses of the compares, they will be dead
5120 // and we can adjust the cost by removing their cost.
5121 if (IntrinsicAndUse.second)
5122 IntrinsicCost -=
5123 TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
5124 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5125 VecCost = std::min(VecCost, IntrinsicCost);
5126 }
5127    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5128 return CommonCost + VecCost - ScalarCost;
5129 }
5130 case Instruction::FNeg:
5131 case Instruction::Add:
5132 case Instruction::FAdd:
5133 case Instruction::Sub:
5134 case Instruction::FSub:
5135 case Instruction::Mul:
5136 case Instruction::FMul:
5137 case Instruction::UDiv:
5138 case Instruction::SDiv:
5139 case Instruction::FDiv:
5140 case Instruction::URem:
5141 case Instruction::SRem:
5142 case Instruction::FRem:
5143 case Instruction::Shl:
5144 case Instruction::LShr:
5145 case Instruction::AShr:
5146 case Instruction::And:
5147 case Instruction::Or:
5148 case Instruction::Xor: {
5149 // Certain instructions can be cheaper to vectorize if they have a
5150 // constant second vector operand.
5151 TargetTransformInfo::OperandValueKind Op1VK =
5152 TargetTransformInfo::OK_AnyValue;
5153 TargetTransformInfo::OperandValueKind Op2VK =
5154 TargetTransformInfo::OK_UniformConstantValue;
5155 TargetTransformInfo::OperandValueProperties Op1VP =
5156 TargetTransformInfo::OP_None;
5157 TargetTransformInfo::OperandValueProperties Op2VP =
5158 TargetTransformInfo::OP_PowerOf2;
5159
5160 // If all operands are exactly the same ConstantInt then set the
5161 // operand kind to OK_UniformConstantValue.
5162 // If instead not all operands are constants, then set the operand kind
5163 // to OK_AnyValue. If all operands are constants but not the same,
5164 // then set the operand kind to OK_NonUniformConstantValue.
5165 ConstantInt *CInt0 = nullptr;
5166 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
5167 const Instruction *I = cast<Instruction>(VL[i]);
5168 unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
5169 ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
5170 if (!CInt) {
5171 Op2VK = TargetTransformInfo::OK_AnyValue;
5172 Op2VP = TargetTransformInfo::OP_None;
5173 break;
5174 }
5175 if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
5176 !CInt->getValue().isPowerOf2())
5177 Op2VP = TargetTransformInfo::OP_None;
5178 if (i == 0) {
5179 CInt0 = CInt;
5180 continue;
5181 }
5182 if (CInt0 != CInt)
5183 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
5184 }
5185
5186 SmallVector<const Value *, 4> Operands(VL0->operand_values());
5187 InstructionCost ScalarEltCost =
5188 TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
5189 Op2VK, Op1VP, Op2VP, Operands, VL0);
5190 if (NeedToShuffleReuses) {
5191 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5192 }
5193 InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
5194 InstructionCost VecCost =
5195 TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
5196 Op2VK, Op1VP, Op2VP, Operands, VL0);
5197    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5198 return CommonCost + VecCost - ScalarCost;
5199 }
5200 case Instruction::GetElementPtr: {
5201 TargetTransformInfo::OperandValueKind Op1VK =
5202 TargetTransformInfo::OK_AnyValue;
5203 TargetTransformInfo::OperandValueKind Op2VK =
5204 TargetTransformInfo::OK_UniformConstantValue;
5205
5206 InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
5207 Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
5208 if (NeedToShuffleReuses) {
5209 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5210 }
5211 InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
5212 InstructionCost VecCost = TTI->getArithmeticInstrCost(
5213 Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
5214    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5215 return CommonCost + VecCost - ScalarCost;
5216 }
5217 case Instruction::Load: {
5218 // Cost of wide load - cost of scalar loads.
5219 Align Alignment = cast<LoadInst>(VL0)->getAlign();
5220 InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
5221 Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
5222 if (NeedToShuffleReuses) {
5223 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5224 }
5225 InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
5226 InstructionCost VecLdCost;
5227 if (E->State == TreeEntry::Vectorize) {
5228 VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
5229 CostKind, VL0);
5230 } else {
5231      assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
5232 Align CommonAlignment = Alignment;
5233 for (Value *V : VL)
5234 CommonAlignment =
5235 commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
5236 VecLdCost = TTI->getGatherScatterOpCost(
5237 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5238 /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
5239 }
5240    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
5241 return CommonCost + VecLdCost - ScalarLdCost;
5242 }
5243 case Instruction::Store: {
5244 // We know that we can merge the stores. Calculate the cost.
5245 bool IsReorder = !E->ReorderIndices.empty();
5246 auto *SI =
5247 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
5248 Align Alignment = SI->getAlign();
5249 InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
5250 Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
5251 InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
5252 InstructionCost VecStCost = TTI->getMemoryOpCost(
5253 Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
5254    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
5255 return CommonCost + VecStCost - ScalarStCost;
5256 }
5257 case Instruction::Call: {
5258 CallInst *CI = cast<CallInst>(VL0);
5259 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5260
5261 // Calculate the cost of the scalar and vector calls.
5262 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
5263 InstructionCost ScalarEltCost =
5264 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
5265 if (NeedToShuffleReuses) {
5266 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5267 }
5268 InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
5269
5270 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
5271 InstructionCost VecCallCost =
5272 std::min(VecCallCosts.first, VecCallCosts.second);
5273
5274    LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
5275                      << " (" << VecCallCost << "-" << ScalarCallCost << ")"
5276                      << " for " << *CI << "\n");
5277
5278 return CommonCost + VecCallCost - ScalarCallCost;
5279 }
5280 case Instruction::ShuffleVector: {
5281    assert(E->isAltShuffle() &&
5282           ((Instruction::isBinaryOp(E->getOpcode()) &&
5283             Instruction::isBinaryOp(E->getAltOpcode())) ||
5284            (Instruction::isCast(E->getOpcode()) &&
5285             Instruction::isCast(E->getAltOpcode()))) &&
5286           "Invalid Shuffle Vector Operand");
5287 InstructionCost ScalarCost = 0;
5288 if (NeedToShuffleReuses) {
5289 for (unsigned Idx : E->ReuseShuffleIndices) {
5290 Instruction *I = cast<Instruction>(VL[Idx]);
5291 CommonCost -= TTI->getInstructionCost(I, CostKind);
5292 }
5293 for (Value *V : VL) {
5294 Instruction *I = cast<Instruction>(V);
5295 CommonCost += TTI->getInstructionCost(I, CostKind);
5296 }
5297 }
5298 for (Value *V : VL) {
5299 Instruction *I = cast<Instruction>(V);
5300      assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
5301 ScalarCost += TTI->getInstructionCost(I, CostKind);
5302 }
5303    // VecCost is the sum of the cost of creating the two vectors
5304    // and the cost of creating the shuffle.
5305 InstructionCost VecCost = 0;
5306 // Try to find the previous shuffle node with the same operands and same
5307 // main/alternate ops.
5308 auto &&TryFindNodeWithEqualOperands = [this, E]() {
5309 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5310 if (TE.get() == E)
5311 break;
5312 if (TE->isAltShuffle() &&
5313 ((TE->getOpcode() == E->getOpcode() &&
5314 TE->getAltOpcode() == E->getAltOpcode()) ||
5315 (TE->getOpcode() == E->getAltOpcode() &&
5316 TE->getAltOpcode() == E->getOpcode())) &&
5317 TE->hasEqualOperands(*E))
5318 return true;
5319 }
5320 return false;
5321 };
5322 if (TryFindNodeWithEqualOperands()) {
5323      LLVM_DEBUG({
5324        dbgs() << "SLP: diamond match for alternate node found.\n";
5325        E->dump();
5326      });
5327 // No need to add new vector costs here since we're going to reuse
5328 // same main/alternate vector ops, just do different shuffling.
5329 } else if (Instruction::isBinaryOp(E->getOpcode())) {
5330 VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
5331 VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
5332 CostKind);
5333 } else {
5334 Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
5335 Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
5336 auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
5337 auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
5338 VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
5339 TTI::CastContextHint::None, CostKind);
5340 VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
5341 TTI::CastContextHint::None, CostKind);
5342 }
5343
5344 SmallVector<int> Mask;
5345 buildSuffleEntryMask(
5346 E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
5347 [E](Instruction *I) {
5348          assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
5349 return I->getOpcode() == E->getAltOpcode();
5350 },
5351 Mask);
5352 CommonCost =
5353 TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
5354    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5355 return CommonCost + VecCost - ScalarCost;
5356 }
5357 default:
5358    llvm_unreachable("Unknown instruction");
5359 }
5360}
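
Across the cases above, the entry cost follows one accounting scheme: the returned value is CommonCost + VecCost - ScalarCost, where CommonCost carries any reorder/reuse shuffle that must be applied on top of the vectorized node, and a negative result means vectorizing this entry is expected to be profitable. With hypothetical numbers chosen purely for illustration (not produced by this code): a 4-wide Add bundle with ScalarEltCost = 1 gives ScalarCost = 4; if VecCost = 1 and CommonCost = 0, the entry contributes 0 + 1 - 4 = -3 to the total tree cost.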
5361
5362bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
5363  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
5364                    << VectorizableTree.size() << " is fully vectorizable .\n");
5365
5366 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
5367 SmallVector<int> Mask;
5368 return TE->State == TreeEntry::NeedToGather &&
5369 !any_of(TE->Scalars,
5370 [this](Value *V) { return EphValues.contains(V); }) &&
5371 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
5372 TE->Scalars.size() < Limit ||
5373 ((TE->getOpcode() == Instruction::ExtractElement ||
5374 all_of(TE->Scalars,
5375 [](Value *V) {
5376 return isa<ExtractElementInst, UndefValue>(V);
5377 })) &&
5378 isFixedVectorShuffle(TE->Scalars, Mask)) ||
5379 (TE->State == TreeEntry::NeedToGather &&
5380 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
5381 };
5382
5383 // We only handle trees of heights 1 and 2.
5384 if (VectorizableTree.size() == 1 &&
5385 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
5386 (ForReduction &&
5387 AreVectorizableGathers(VectorizableTree[0].get(),
5388 VectorizableTree[0]->Scalars.size()) &&
5389 VectorizableTree[0]->getVectorFactor() > 2)))
5390 return true;
5391
5392 if (VectorizableTree.size() != 2)
5393 return false;
5394
5395  // Handle splat and all-constant stores. Also try to vectorize tiny trees
5396  // whose second node is a gather with fewer scalar operands than the initial
5397  // tree element (it may be profitable to shuffle the second gather), or whose
5398  // scalars are extractelements that form a shuffle.
5399 SmallVector<int> Mask;
5400 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
5401 AreVectorizableGathers(VectorizableTree[1].get(),
5402 VectorizableTree[0]->Scalars.size()))
5403 return true;
5404
5405 // Gathering cost would be too much for tiny trees.
5406 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
5407 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
5408 VectorizableTree[0]->State != TreeEntry::ScatterVectorize))
5409 return false;
5410
5411 return true;
5412}
5413
5414static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
5415 TargetTransformInfo *TTI,
5416 bool MustMatchOrInst) {
5417 // Look past the root to find a source value. Arbitrarily follow the
5418 // path through operand 0 of any 'or'. Also, peek through optional
5419 // shift-left-by-multiple-of-8-bits.
5420 Value *ZextLoad = Root;
5421 const APInt *ShAmtC;
5422 bool FoundOr = false;
5423 while (!isa<ConstantExpr>(ZextLoad) &&
5424 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
5425 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
5426 ShAmtC->urem(8) == 0))) {
5427 auto *BinOp = cast<BinaryOperator>(ZextLoad);
5428 ZextLoad = BinOp->getOperand(0);
5429 if (BinOp->getOpcode() == Instruction::Or)
5430 FoundOr = true;
5431 }
5432 // Check if the input is an extended load of the required or/shift expression.
5433 Value *Load;
5434 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
5435 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
5436 return false;
5437
5438 // Require that the total load bit width is a legal integer type.
5439 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
5440 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
5441 Type *SrcTy = Load->getType();
5442 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
5443 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
5444 return false;
5445
5446 // Everything matched - assume that we can fold the whole sequence using
5447 // load combining.
5448  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
5449                    << *(cast<Instruction>(Root)) << "\n");
5450
5451 return true;
5452}
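
isLoadCombineCandidateImpl looks through 'or' operations and byte-aligned left shifts for a zero-extended narrow load, i.e. the idiom of assembling a wide integer from adjacent narrow loads. A hypothetical scalar form of the pattern being screened for (illustration only, not taken from the analyzed code):

#include <cstdint>

// Each term is a zero-extended byte load, shifted left by a multiple of 8 and
// OR'd into the result. If the combined width (element width * NumElts) is a
// legal integer type, the SLP vectorizer backs off and leaves the pattern to
// backend load combining, which can fold the chain into a single wide load.
uint32_t combineBytes(const uint8_t *P) {
  return static_cast<uint32_t>(P[0]) |
         (static_cast<uint32_t>(P[1]) << 8) |
         (static_cast<uint32_t>(P[2]) << 16) |
         (static_cast<uint32_t>(P[3]) << 24);
}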
5453
5454bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
5455 if (RdxKind != RecurKind::Or)
5456 return false;
5457
5458 unsigned NumElts = VectorizableTree[0]->Scalars.size();
5459 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
5460 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
5461 /* MatchOr */ false);
5462}
5463
5464bool BoUpSLP::isLoadCombineCandidate() const {
5465 // Peek through a final sequence of stores and check if all operations are
5466 // likely to be load-combined.
5467 unsigned NumElts = VectorizableTree[0]->Scalars.size();
5468 for (Value *Scalar : VectorizableTree[0]->Scalars) {
5469 Value *X;
5470 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
5471 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
5472 return false;
5473 }
5474 return true;
5475}
5476
5477bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
5478 // No need to vectorize inserts of gathered values.
5479 if (VectorizableTree.size() == 2 &&
5480 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
5481 VectorizableTree[1]->State == TreeEntry::NeedToGather)
5482 return true;
5483
5484 // We can vectorize the tree if its size is greater than or equal to the
5485 // minimum size specified by the MinTreeSize command line option.
5486 if (VectorizableTree.size() >= MinTreeSize)
5487 return false;
5488
5489 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
5490 // can vectorize it if we can prove it fully vectorizable.
5491 if (isFullyVectorizableTinyTree(ForReduction))
5492 return false;
5493
5494  assert(VectorizableTree.empty()
5495             ? ExternalUses.empty()
5496             : true && "We shouldn't have any external users");
5497
5498 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
5499 // vectorizable.
5500 return true;
5501}
5502
5503InstructionCost BoUpSLP::getSpillCost() const {
5504 // Walk from the bottom of the tree to the top, tracking which values are
5505 // live. When we see a call instruction that is not part of our tree,
5506 // query TTI to see if there is a cost to keeping values live over it
5507 // (for example, if spills and fills are required).
5508 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
5509 InstructionCost Cost = 0;
5510
5511 SmallPtrSet<Instruction*, 4> LiveValues;
5512 Instruction *PrevInst = nullptr;
5513
5514 // The entries in VectorizableTree are not necessarily ordered by their
5515 // position in basic blocks. Collect them and order them by dominance so later
5516 // instructions are guaranteed to be visited first. For instructions in
5517 // different basic blocks, we only scan to the beginning of the block, so
5518 // their order does not matter, as long as all instructions in a basic block
5519 // are grouped together. Using dominance ensures a deterministic order.
5520 SmallVector<Instruction *, 16> OrderedScalars;
5521 for (const auto &TEPtr : VectorizableTree) {
5522 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
5523 if (!Inst)
5524 continue;
5525 OrderedScalars.push_back(Inst);
5526 }
5527 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
5528 auto *NodeA = DT->getNode(A->g